4— title: “proyecto” author: “alejandro” date: “2025-05-13” output: html_document — — title: “Proyecto_datos” output: html_document date: “2025-03-24” —
knitr::opts_chunk$set(echo = TRUE)
# Lista de paquetes necesarios
packages <- c(
"tidyverse", "ggplot2", "pander", "httr", "jsonlite", "dplyr", "stringr", "readr", "knitr", "tidyr",
"class", "caret", "FNN", "Metrics", "randomForest", "xgboost", "Matrix", "future", "pls",
"rsample", "recipes", "workflows", "tune", "parsnip", "dials"
)
# Instalar solo los que faltan
to_install <- setdiff(packages, rownames(installed.packages()))
if (length(to_install) > 0) install.packages(to_install, dependencies = TRUE)
# Cargar paquetes
invisible(lapply(packages, library, character.only = TRUE))
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'pander' was built under R version 4.4.3
## Warning: package 'httr' was built under R version 4.4.3
## Warning: package 'jsonlite' was built under R version 4.4.3
##
## Adjuntando el paquete: 'jsonlite'
##
## The following object is masked from 'package:purrr':
##
## flatten
## Warning: package 'caret' was built under R version 4.4.3
## Cargando paquete requerido: lattice
##
## Adjuntando el paquete: 'caret'
##
## The following object is masked from 'package:httr':
##
## progress
##
## The following object is masked from 'package:purrr':
##
## lift
## Warning: package 'FNN' was built under R version 4.4.3
##
## Adjuntando el paquete: 'FNN'
##
## The following objects are masked from 'package:class':
##
## knn, knn.cv
## Warning: package 'Metrics' was built under R version 4.4.3
##
## Adjuntando el paquete: 'Metrics'
##
## The following objects are masked from 'package:caret':
##
## precision, recall
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Adjuntando el paquete: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
## Warning: package 'xgboost' was built under R version 4.4.3
##
## Adjuntando el paquete: 'xgboost'
##
## The following object is masked from 'package:dplyr':
##
## slice
##
##
## Adjuntando el paquete: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Warning: package 'future' was built under R version 4.4.3
##
## Adjuntando el paquete: 'future'
##
## The following object is masked from 'package:caret':
##
## cluster
## Warning: package 'pls' was built under R version 4.4.3
##
## Adjuntando el paquete: 'pls'
##
## The following object is masked from 'package:caret':
##
## R2
##
## The following object is masked from 'package:stats':
##
## loadings
## Warning: package 'rsample' was built under R version 4.4.3
## Warning: package 'recipes' was built under R version 4.4.3
##
## Adjuntando el paquete: 'recipes'
##
## The following object is masked from 'package:Matrix':
##
## update
##
## The following object is masked from 'package:stringr':
##
## fixed
##
## The following object is masked from 'package:stats':
##
## step
## Warning: package 'workflows' was built under R version 4.4.3
## Warning: package 'tune' was built under R version 4.4.3
## Warning: package 'parsnip' was built under R version 4.4.3
## Warning: package 'dials' was built under R version 4.4.3
## Cargando paquete requerido: scales
## Warning: package 'scales' was built under R version 4.4.3
##
## Adjuntando el paquete: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
# Cargar archivos
df_distrito <- read.csv("datos_con_distrito.csv", sep = ",", stringsAsFactors = FALSE)
df_crimen <- read.csv("criminalidadbarcelona_2024.csv", sep = ";", stringsAsFactors = FALSE)
# Renombrar columnas
colnames(df_crimen)[colnames(df_crimen) == "Territorio"] <- "distrito"
colnames(df_crimen)[colnames(df_crimen) == "X2024"] <- "criminalidad_distrito"
# Equivalencias manuales
equivalencias <- c(
"L'Eixample" = "Eixample",
"Gracia" = "Gràcia",
"Distrito de les Corts" = "Les Corts",
"Distrito de Les Corts" = "Les Corts",
"Distrito de Sarrià-Sant Gervasi" = "Sarrià-Sant Gervasi",
"Distrito de Sants-Montjuïc" = "Sants-Montjuïc",
"Distrito de Nou Barris" = "Nou Barris"
)
df_distrito$distrito <- recode(df_distrito$distrito, !!!equivalencias)
# Filtrar solo distritos válidos
df_crimen <- df_crimen %>%
filter(`Tipo.de.territorio` == "Districte")
# Hacer el join
df_combinada <- left_join(df_distrito, df_crimen[, c("distrito", "criminalidad_distrito")], by = "distrito")
# Guardar resultado
write.csv(df_combinada, "datos_combinados.csv", row.names = FALSE)
head(df_combinada)
## id listing_url
## 1 18666 https://www.airbnb.com/rooms/18666
## 2 18674 https://www.airbnb.com/rooms/18674
## 3 21605 https://www.airbnb.com/rooms/21605
## 4 23197 https://www.airbnb.com/rooms/23197
## 5 25786 https://www.airbnb.com/rooms/25786
## 6 31377 https://www.airbnb.com/rooms/31377
## name
## 1 Flat with Sunny Terrace
## 2 Huge flat for 8 people close to Sagrada Familia
## 3 Nice and sunny duble room
## 4 FORUM DELUXE 5 MINS WALK CCIB CENTER & SEA!
## 5 NICE ROOM AVAILABLE IN THE HEART OF GRACIA
## 6 Room for 2, Sagrada Famili
## summary
## 1 Apartment located near the "Plaza de las Glorias" and the second-hand market (Encants). The accommodation is also close to the National Theatre of Catalunya and the Agbar Tower which has become one of the new symbols of Barcelona. Licence number: HUTB-(PHONE NUMBER HIDDEN)
## 2 110m2 apartment to rent in Barcelona. Located in the Eixample district, near the Sagrada Familia. It has a small balcony where you can see the temple of Gaudi. Capacity for 8 people. Licence number: HUTB-002062
## 3 The flat is in Poblenou district, and the room is a double room with a double bed, a wardrobe, a table, TV, wifi, heating and wood floor. Beautiful and charmy.
## 4 I do not accept groups of young people under 25, the apartment is not suitable for you, ideal for families and quiet people. Beautiful apartment, large terrace, 5 min walk CCIB center, sea, Port Forum. Great location for combining business with pleasure. After a long day at a conference, beach or sightseeing, sit out and relax on the large terrace for a quiet dinner or a nice cold glass of beer or wine away from the bustle and noise of Barcelona and tourist crowds.
## 5 JUST GO THROUGH THE MANY REVIEWS I GOT THROUGH THE YEARS, NO BETTER FEEDBACK THAN THAT. WELCOME.
## 6 The room in 500 m from Sagrada Familia. 3 branches of the subway in 7 minutes of walking. A straight line to the center and the beach. A safe area area with the developed infrastructure. Completely equipped kitchen, a washing machine, an air conditioner. Private bathroom. Wi-fi free.
## space
## 1 Nice apartment situated on the penthouse floor of a building with elevator. Huge Living/dining-room with double sofa-bed 1 bedroom with two single beds 1 bedroom with double bed Nice kitchen opened to the living/dining-room and fully equipped for 6 people Bathroom with shower The accommodation has been recently renovated and tastefully decorated with a comfortable furniture and wood floor. Also it is equipped with air-conditioning and heating.
## 2 Apartment with 110 m2 located in the 6th floor in a building with elevator Huge living/dinig-room 1 double bedrrom 1 bedroom with 2 single beds 1 bedroom with bunk beds Kitchen fully equipped for 8 people 1 bathroom with bathtub 1 small bathroom with shower balcony The accommodation has been recently renovated and tastefully decorated with a comfortable furniture and wood floor. Also it is equipped with heating, air conditioning and wifi.
## 3 L'apartament està al barri de Poblenou, i l'habitació te un llit doble de (Phone number hidden by Airbnb) armari, una taula i cadira d'estudi, TV, wifi, calefacció i terra de parquet. Bonica i encantadora. A prop de la platja, Glòries, Sagrada Família, museu del disseny, els encants, rambla del Poblenou, torre Agbar, Forum. L'habitació no té clau
## 4 Elegant spacious apartment suitable for 6. Ample lounge/dining area with AC, floor to ceiling sliding glass doors open out to the large balcony with dining table & chairs and wicker sofa. Master bedroom with ensuite bathroom. ceiling fan, built in wardrobes and view of the Tibidabo mountain and children´s play area. Office bedroom has bunk beds, 90x200 cm mattresses, pedestal ventilator, built in wardrobes and view of the Tibidabo mountain and childrens play area. Third bedroom, two single beds. fan, built in wardrobes and view of the Tibidabo mountain and childrens play area. Guest bathroom: walk-in shower, basin, bidet, WC. Fully equipped kitchen with access to the balcony; Refrigerator, freezer, halogen cooker, oven, microwave, Nespresso coffee machine, conventional coffee machine, toaster, electric juicer, kettle, dishwasher... Babies travel cot/high chair., AC only in the lounge, ceiling and pedestal fans in bedrooms. Laundry with washing machine and dryer, ironing equipment, F
## 5 Room available for rent.- PEDRO PEREZ. Shared with a Catalan male aged 38, Ayurvedic massage therapist and Yoga practitioner. Looking for people non-smoking, enthusiastic willing to share more than just the space in a centric beautiful flat in PLaça Vila de Gracia. i am very flexible you can use anything in the house feel free to ask anything! The neighborhood is really special you could live here and not needing anything from outside, such an experience, just 100 years ago was a village in the outskirts of barcelona, we do have our own cultural program throughout the year, very Catalan place. The area is full of bohemians, artisans and modern artists. Most of the area has been taken over by us over the past 10 years making it a mix between the past and the present-future. Metro stations around are: Diagonal L3-L5, Fontana L3, Joanic L4, 10-15 minutes walking to city center Ramblas. Separate Wardrobe room available Kitchen and bathroom shared Bills included available for renti
## 6 Great location, right next to Sagrada Familia, the symbol of Barcelona. Great infrastructure, a shopping area nearby. Next to two metro stations: Hospital de Sant Pau & Alfonso X. The room is in great condition, newly furnished. Private bathroom. Laundry place and kitchen at your disposition. Wi-fi internet. ХОРОШИЙ РАЙОН С РАЗВИТОЙ ИНФРАСТРУКТУРОЙ. РЯДОМ САГРАДА ФАМИЛИЯ, САН ПАУ, ПАРК ГУЭЛЬ, АВЕНИДА ГАУДИ. ОТ МЕТРО SAN PAU 3 МИН. ОТ ALFONS X 5 МИН - ПРЯМАЯ ВЕТКА ДО ПЛЯЖА. ЕВРОПЕЙСКИЕ УСЛОВИЯ. КОНДИЦИОНЕРЫ. БЕСПЛАТНЫЙ ИНТЕРНЕТ WI-FI ADSL. КОМПЬЮТЕР ПОЛНОСТЬЮ ОБОРУДОВАННАЯ КУХНЯ С ПОСУДОЙ. ЧАЙ, КОФЕ, САХАР - БЕСПЛАТНО. СТИРАЛЬНАЯ МАШИНА, УТЮГ, ОБОГРЕВ И Т.Д. ВЫДАЕМ ПОСТЕЛЬНОЕ БЕЛЬЕ, ПОЛОТЕНЦА, ВКЛЮЧАЯ ПЛЯЖНЫЕ, ФЕН, ШАМПУНЬ, ГЕЛЬ ДЛЯ ДУША.
## description
## 1 Apartment located near the "Plaza de las Glorias" and the second-hand market (Encants). The accommodation is also close to the National Theatre of Catalunya and the Agbar Tower which has become one of the new symbols of Barcelona. Licence number: HUTB-(PHONE NUMBER HIDDEN) Nice apartment situated on the penthouse floor of a building with elevator. Huge Living/dining-room with double sofa-bed 1 bedroom with two single beds 1 bedroom with double bed Nice kitchen opened to the living/dining-room and fully equipped for 6 people Bathroom with shower The accommodation has been recently renovated and tastefully decorated with a comfortable furniture and wood floor. Also it is equipped with air-conditioning and heating. Free Wifi - air conditioning. We will provide basic amenities like shower gel, shampoo,and hand soap. Also, 1 set of bed linen and towels per person will be included. We can provide you all kind of entrance and tickets for monuments and shows in Barcelona in order you avo
## 2 110m2 apartment to rent in Barcelona. Located in the Eixample district, near the Sagrada Familia. It has a small balcony where you can see the temple of Gaudi. Capacity for 8 people. Licence number: HUTB-002062 Apartment with 110 m2 located in the 6th floor in a building with elevator Huge living/dinig-room 1 double bedrrom 1 bedroom with 2 single beds 1 bedroom with bunk beds Kitchen fully equipped for 8 people 1 bathroom with bathtub 1 small bathroom with shower balcony The accommodation has been recently renovated and tastefully decorated with a comfortable furniture and wood floor. Also it is equipped with heating, air conditioning and wifi. Free Wifi - air conditioning. We will provide basic amenities like shower gel, shampoo,and hand soap. Also, 1 set of bed linen and towels per person will be included. We can provide you all kind of entrance and tickets for monuments and shows in Barcelona in order you avoid queues and plan your trip in advance. Also we can organize sh
## 3 The flat is in Poblenou district, and the room is a double room with a double bed, a wardrobe, a table, TV, wifi, heating and wood floor. Beautiful and charmy. L'apartament està al barri de Poblenou, i l'habitació te un llit doble de (Phone number hidden by Airbnb) armari, una taula i cadira d'estudi, TV, wifi, calefacció i terra de parquet. Bonica i encantadora. A prop de la platja, Glòries, Sagrada Família, museu del disseny, els encants, rambla del Poblenou, torre Agbar, Forum. L'habitació no té clau The kitchen is fully equipped and can use the washer and dryer. We also have a beautiful balcony on the apartment. And, of course, you can use the bathroom and the living and dining room. My husband and I will be available in person or by phone/ (Hidden by Airbnb) for any questions you have during your stay. Poblenou as one of the few areas that has grown independently, keeping away from fleeting trends and maintaining its identity. As a result it has become one of the most genuine and
## 4 I do not accept groups of young people under 25, the apartment is not suitable for you, ideal for families and quiet people. Beautiful apartment, large terrace, 5 min walk CCIB center, sea, Port Forum. Great location for combining business with pleasure. After a long day at a conference, beach or sightseeing, sit out and relax on the large terrace for a quiet dinner or a nice cold glass of beer or wine away from the bustle and noise of Barcelona and tourist crowds. Elegant spacious apartment suitable for 6. Ample lounge/dining area with AC, floor to ceiling sliding glass doors open out to the large balcony with dining table & chairs and wicker sofa. Master bedroom with ensuite bathroom. ceiling fan, built in wardrobes and view of the Tibidabo mountain and children´s play area. Office bedroom has bunk beds, 90x200 cm mattresses, pedestal ventilator, built in wardrobes and view of the Tibidabo mountain and childrens play area. Third bedroom, two single beds. fan, built in wardrobes a
## 5 JUST GO THROUGH THE MANY REVIEWS I GOT THROUGH THE YEARS, NO BETTER FEEDBACK THAN THAT. WELCOME. Room available for rent.- PEDRO PEREZ. Shared with a Catalan male aged 38, Ayurvedic massage therapist and Yoga practitioner. Looking for people non-smoking, enthusiastic willing to share more than just the space in a centric beautiful flat in PLaça Vila de Gracia. i am very flexible you can use anything in the house feel free to ask anything! The neighborhood is really special you could live here and not needing anything from outside, such an experience, just 100 years ago was a village in the outskirts of barcelona, we do have our own cultural program throughout the year, very Catalan place. The area is full of bohemians, artisans and modern artists. Most of the area has been taken over by us over the past 10 years making it a mix between the past and the present-future. Metro stations around are: Diagonal L3-L5, Fontana L3, Joanic L4, 10-15 minutes walking to city center Ramblas. S
## 6 The room in 500 m from Sagrada Familia. 3 branches of the subway in 7 minutes of walking. A straight line to the center and the beach. A safe area area with the developed infrastructure. Completely equipped kitchen, a washing machine, an air conditioner. Private bathroom. Wi-fi free. Great location, right next to Sagrada Familia, the symbol of Barcelona. Great infrastructure, a shopping area nearby. Next to two metro stations: Hospital de Sant Pau & Alfonso X. The room is in great condition, newly furnished. Private bathroom. Laundry place and kitchen at your disposition. Wi-fi internet. ХОРОШИЙ РАЙОН С РАЗВИТОЙ ИНФРАСТРУКТУРОЙ. РЯДОМ САГРАДА ФАМИЛИЯ, САН ПАУ, ПАРК ГУЭЛЬ, АВЕНИДА ГАУДИ. ОТ МЕТРО SAN PAU 3 МИН. ОТ ALFONS X 5 МИН - ПРЯМАЯ ВЕТКА ДО ПЛЯЖА. ЕВРОПЕЙСКИЕ УСЛОВИЯ. КОНДИЦИОНЕРЫ. БЕСПЛАТНЫЙ ИНТЕРНЕТ WI-FI ADSL. КОМПЬЮТЕР ПОЛНОСТЬЮ ОБОРУДОВАННАЯ КУХНЯ С ПОСУДОЙ. ЧАЙ, КОФЕ, САХАР - БЕСПЛАТНО. СТИРАЛЬНАЯ МАШИНА, УТЮГ, ОБОГРЕВ И Т.Д. ВЫДАЕМ ПОСТЕЛЬНОЕ БЕЛЬЕ, ПОЛОТЕНЦА, ВК
## neighborhood_overview
## 1 Apartment in Barcelona near to the Plaza de las Glorias, the old market (Encants), the Agbar Tower one of the new symbols of Barcelona and the Teatre Nacional de Catalunya. All kinds of services in surroundings (shops, supermarkets, restaurants, bars).
## 2 Apartment in Barcelona located in the heart of Eixample district, within only 150 m form the great Sagrada Familia and really near of Gaudí Avenue and the famous Sant Pau Hospital . All kind of services in surroundings (shops, supermarkets, restaurants, bars).
## 3 Poblenou as one of the few areas that has grown independently, keeping away from fleeting trends and maintaining its identity. As a result it has become one of the most genuine and prolific metropolitan scenarios of Barcelona city. In recent years, a series of creative hubs have found their home in Poblenou, cultural and commercial spaces that offer similar innovative proposals, becoming part of the neighbourhood’s future without giving up its industrial past. To the mission of the neighbourhood’s normalization, the work of the entrepreneurs has been added, raising the area’s value and adding it to the map of alternative cultural circuits. See more info in (Website hidden by Airbnb)
## 4 Strategically located in the area of Parc del Fòrum, a spacious area where all kinds of events and events are held. It is an area reclaimed by the sea where you can find: the Esplanade, where fairs, music festivals or large events are held; the Fòrum building, triangular in shape and the undisputed icon of the new architecture of Barcelona; the CCIB-Center de Convencions Internacional de Barcelona,; the Parc dels Auditoris, a large outdoor space in front of the sea; the spectacular photovoltaic plate, inclined and suspended over very peculiar columns, the Fòrum Marina with mega yachts and the Forum safe bathing area with access for wheelchairs ,where you can savor the genuine and Mediterranean character of the city. Great area also for walking, cycling, running..... A few minutes walk to the Diagonal Mar shopping center, frequent transportation takes you to the historic center in about 10 minutes by metro or tram T4 to the Olympic Port, etc.
## 5 Solo decir que a menudo ni salgo del barrio. Muy entretenido con sus gentes y lugares.
## 6
## access
## 1 Free Wifi - air conditioning. We will provide basic amenities like shower gel, shampoo,and hand soap. Also, 1 set of bed linen and towels per person will be included.
## 2 Free Wifi - air conditioning. We will provide basic amenities like shower gel, shampoo,and hand soap. Also, 1 set of bed linen and towels per person will be included.
## 3 The kitchen is fully equipped and can use the washer and dryer. We also have a beautiful balcony on the apartment. And, of course, you can use the bathroom and the living and dining room.
## 4 You book the entire apartment for yourselves.
## 5 All access with respect. Kitchen facilities need permission. Feel free to ask. Avoid Noise after midnight and early.morning
## 6
## picture_url
## 1 https://a0.muscache.com/im/pictures/47f88bc6-6561-445a-beec-f8ec4ddc1038.jpg?aki_policy=large
## 2 https://a0.muscache.com/im/pictures/13031453/413cdbfc_original.jpg?aki_policy=large
## 3 https://a0.muscache.com/im/pictures/774ca73d-13f4-4848-83c9-965d8332af8a.jpg?aki_policy=large
## 4 https://a0.muscache.com/im/pictures/738532/806da1bf_original.jpg?aki_policy=large
## 5 https://a0.muscache.com/im/pictures/6619f0c7-844e-40a1-8521-44c19b7a4af2.jpg?aki_policy=large
## 6 https://a0.muscache.com/im/pictures/ac805ead-12f0-4ebe-89b3-53ea9ede132f.jpg?aki_policy=large
## host_id host_url host_name host_since
## 1 71615 https://www.airbnb.com/users/show/71615 Mireia And Maria 19/01/2010
## 2 71615 https://www.airbnb.com/users/show/71615 Mireia And Maria 19/01/2010
## 3 82522 https://www.airbnb.com/users/show/82522 Meritxell 18/02/2010
## 4 90417 https://www.airbnb.com/users/show/90417 Etain (Marnie) 09/03/2010
## 5 108310 https://www.airbnb.com/users/show/108310 Pedro 14/04/2010
## 6 134698 https://www.airbnb.com/users/show/134698 Svetlana 29/05/2010
## host_response_time host_response_rate host_is_superhost
## 1 within an hour 99% f
## 2 within an hour 99% f
## 3 within a few hours 100% f
## 4 within an hour 100% t
## 5 within an hour 100% t
## 6 within an hour 100% f
## host_picture_url
## 1 https://a0.muscache.com/im/users/71615/profile_pic/1426612511/original.jpg?aki_policy=profile_x_medium
## 2 https://a0.muscache.com/im/users/71615/profile_pic/1426612511/original.jpg?aki_policy=profile_x_medium
## 3 https://a0.muscache.com/im/pictures/ece65ffd-a798-4209-b1b0-a51060412b29.jpg?aki_policy=profile_x_medium
## 4 https://a0.muscache.com/im/users/90417/profile_pic/1300298768/original.jpg?aki_policy=profile_x_medium
## 5 https://a0.muscache.com/im/pictures/user/2b13f530-a8dd-4777-93a5-a133ac46b97d.jpg?aki_policy=profile_x_medium
## 6 https://a0.muscache.com/im/users/134698/profile_pic/1334849467/original.jpg?aki_policy=profile_x_medium
## host_neighbourhood host_listings_count
## 1 El Camp de l'Arpa del Clot 45
## 2 El Camp de l'Arpa del Clot 45
## 3 El Poblenou 2
## 4 El Besòs i el Maresme 5
## 5 Vila de Gràcia 1
## 6 El Baix Guinardó 9
## host_verifications
## 1 ['email', 'phone', 'reviews', 'jumio', 'government_id']
## 2 ['email', 'phone', 'reviews', 'jumio', 'government_id']
## 3 ['email', 'phone', 'reviews', 'jumio', 'offline_government_id', 'government_id']
## 4 ['email', 'phone', 'reviews', 'jumio', 'offline_government_id', 'selfie', 'government_id', 'identity_manual']
## 5 ['email', 'phone', 'reviews', 'jumio', 'offline_government_id', 'selfie', 'government_id', 'identity_manual']
## 6 ['email', 'phone', 'reviews']
## host_has_profile_pic host_identity_verified
## 1 t t
## 2 t t
## 3 t t
## 4 t t
## 5 t t
## 6 t f
## street neighbourhood
## 1 Barcelona, CT, Spain Sant Martí
## 2 Barcelona, CT, Spain La Sagrada Família
## 3 Barcelona, Catalunya, Spain Sant Martí
## 4 Sant Adria de Besos, Barcelona, Spain Sant Martí
## 5 Barcelona, Barcelona, Spain Vila de Gràcia
## 6 Barcelona, CT, Spain Horta-Guinardó
## neighbourhood_cleansed neighbourhood_group_cleansed city
## 1 el Camp de l'Arpa del Clot Sant Martí Barcelona
## 2 la Sagrada Família Eixample Barcelona
## 3 el Poblenou Sant Martí Barcelona
## 4 el Besòs i el Maresme Sant Martí Sant Adria de Besos
## 5 la Vila de Gràcia Gràcia Barcelona
## 6 el Baix Guinardó Horta-Guinardó Barcelona
## zipcode country latitude longitude is_location_exact property_type
## 1 8026 Spain 41.40889 2.18555 t Apartment
## 2 8025 Spain 41.40420 2.17306 t Apartment
## 3 8018 Spain 41.40560 2.19821 t Apartment
## 4 8930 Spain 41.41203 2.22114 f Apartment
## 5 8012 Spain 41.40145 2.15645 t Apartment
## 6 8025 Spain 41.41097 2.17070 t Apartment
## room_type accommodates bathrooms bedrooms beds
## 1 Entire home/apt 6 1 2 4
## 2 Entire home/apt 8 2 3 6
## 3 Private room 2 1 1 1
## 4 Entire home/apt 6 2 3 8
## 5 Private room 2 1 1 1
## 6 Private room 2 1 1 2
## amenities
## 1 {TV,Internet,Wifi,"Air conditioning","Wheelchair accessible",Kitchen,Elevator,"Free street parking",Heating,"Family/kid friendly",Washer,Dryer,Essentials,Shampoo,"Hair dryer","Hot water","Host greets you","Paid parking on premises"}
## 2 {TV,Internet,Wifi,"Air conditioning","Wheelchair accessible",Kitchen,Elevator,"Free street parking","Buzzer/wireless intercom",Heating,"Family/kid friendly",Washer,Essentials,Shampoo,Hangers,"Hair dryer",Iron,"Laptop friendly workspace",Crib,"Hot water","Host greets you","Paid parking on premises"}
## 3 {TV,Wifi,Kitchen,"Paid parking off premises",Elevator,Heating,"Family/kid friendly",Washer,Dryer,"First aid kit",Essentials,Shampoo,Hangers,"Hair dryer",Iron,"Laptop friendly workspace","Self check-in","Smart lock","Hot water","Bed linens","Extra pillows and blankets",Microwave,"Coffee maker",Refrigerator,Dishwasher,"Dishes and silverware","Cooking basics",Oven,Stove,"Patio or balcony","Luggage dropoff allowed","Cleaning before checkout","No stairs or steps to enter","Wide entrance for guests","Flat path to guest entrance","Well-lit path to entrance","No stairs or steps to enter","No stairs or steps to enter","No stairs or steps to enter","Wide entryway","Paid parking on premises"}
## 4 {TV,Internet,Wifi,"Wheelchair accessible",Kitchen,"Paid parking off premises",Elevator,"Buzzer/wireless intercom",Heating,"Family/kid friendly",Washer,Dryer,"Smoke detector","Carbon monoxide detector","Fire extinguisher",Essentials,Shampoo,Hangers,"Hair dryer",Iron,"Laptop friendly workspace","High chair",Crib,"Pack ’n Play/travel crib","Hot water","Bed linens",Microwave,"Coffee maker",Refrigerator,Dishwasher,"Dishes and silverware","Cooking basics",Oven,Stove,"Patio or balcony","Luggage dropoff allowed","Long term stays allowed","Wide hallways","No stairs or steps to enter","Wide entrance for guests","Flat path to guest entrance","Well-lit path to entrance","No stairs or steps to enter","Wide entryway","Host greets you","Paid parking on premises"}
## 5 {TV,Wifi,"Air conditioning",Kitchen,"Smoking allowed",Elevator,Heating,"Family/kid friendly",Washer,"Fire extinguisher",Essentials,Shampoo,"Lock on bedroom door",Hangers,"Hair dryer","Hot water","Luggage dropoff allowed"}
## 6 {Wifi,"Air conditioning",Kitchen,"Paid parking off premises","Buzzer/wireless intercom",Heating,"Family/kid friendly",Washer,Essentials,Shampoo,"Lock on bedroom door",Hangers,"Hair dryer",Iron,"translation missing: en.hosting_amenity_50","Bed linens","Extra pillows and blankets",Microwave,"Coffee maker",Refrigerator,"Dishes and silverware","Cooking basics",Oven,Stove,"Luggage dropoff allowed","Host greets you"}
## square_feet price cleaning_fee minimum_nights maximum_nights
## 1 75 $130.00 $42.00 3 730
## 2 NA $60.00 $50.00 1 1125
## 3 108 $33.00 2 1125
## 4 NA $210.00 $80.00 3 1125
## 5 NA $45.00 1 730
## 6 NA $42.00 3 1125
## has_availability availability_30 availability_60 availability_90
## 1 t 0 0 0
## 2 t 3 20 50
## 3 t 4 8 15
## 4 t 11 33 63
## 5 t 8 19 41
## 6 t 5 8 16
## availability_365 number_of_reviews number_of_reviews_ltm first_review
## 1 182 1 0 10/10/2015
## 2 129 15 10 27/05/2013
## 3 15 119 36 08/05/2016
## 4 318 45 16 15/03/2011
## 5 115 241 49 11/08/2010
## 6 211 4 0 20/05/2015
## last_review review_scores_rating review_scores_accuracy
## 1 10/10/2015 80 10
## 2 02/07/2019 87 9
## 3 04/07/2019 90 10
## 4 07/07/2019 95 10
## 5 03/07/2019 95 10
## 6 12/03/2018 95 9
## review_scores_cleanliness review_scores_checkin review_scores_communication
## 1 10 2 10
## 2 9 10 10
## 3 9 10 10
## 4 10 10 10
## 5 10 10 10
## 6 10 10 10
## review_scores_location review_scores_value instant_bookable reviews_per_month
## 1 10 8 f 0.02
## 2 9 8 t 0.20
## 3 9 9 f 3.08
## 4 9 9 t 0.44
## 5 10 9 t 2.22
## 6 9 9 f 0.08
## distrito criminalidad_distrito
## 1 Sant Martí 25408
## 2 Eixample 46754
## 3 Sant Martí 25408
## 4 Sant Martí 25408
## 5 Gràcia 8588
## 6 Horta-Guinardó 10057
data = read.csv("datos_combinados.csv", sep=",", stringsAsFactors = FALSE)
names(data)
## [1] "id" "listing_url"
## [3] "name" "summary"
## [5] "space" "description"
## [7] "neighborhood_overview" "access"
## [9] "picture_url" "host_id"
## [11] "host_url" "host_name"
## [13] "host_since" "host_response_time"
## [15] "host_response_rate" "host_is_superhost"
## [17] "host_picture_url" "host_neighbourhood"
## [19] "host_listings_count" "host_verifications"
## [21] "host_has_profile_pic" "host_identity_verified"
## [23] "street" "neighbourhood"
## [25] "neighbourhood_cleansed" "neighbourhood_group_cleansed"
## [27] "city" "zipcode"
## [29] "country" "latitude"
## [31] "longitude" "is_location_exact"
## [33] "property_type" "room_type"
## [35] "accommodates" "bathrooms"
## [37] "bedrooms" "beds"
## [39] "amenities" "square_feet"
## [41] "price" "cleaning_fee"
## [43] "minimum_nights" "maximum_nights"
## [45] "has_availability" "availability_30"
## [47] "availability_60" "availability_90"
## [49] "availability_365" "number_of_reviews"
## [51] "number_of_reviews_ltm" "first_review"
## [53] "last_review" "review_scores_rating"
## [55] "review_scores_accuracy" "review_scores_cleanliness"
## [57] "review_scores_checkin" "review_scores_communication"
## [59] "review_scores_location" "review_scores_value"
## [61] "instant_bookable" "reviews_per_month"
## [63] "distrito" "criminalidad_distrito"
na_summary <- data %>%
summarise(across(everything(), ~ mean(is.na(.) | . == "") * 100)) %>%
pivot_longer(cols = everything(), names_to = "variable", values_to = "na_percentage") %>%
mutate(na_percentage = round(na_percentage, 2)) %>%
arrange(desc(na_percentage))
kable(na_summary, caption = "Porcentaje de Valores faltantes o nulos por Variable")
| variable | na_percentage |
|---|---|
| square_feet | 97.36 |
| access | 46.03 |
| neighborhood_overview | 34.26 |
| space | 26.39 |
| review_scores_checkin | 24.50 |
| review_scores_location | 24.49 |
| review_scores_value | 24.49 |
| review_scores_accuracy | 24.48 |
| review_scores_cleanliness | 24.48 |
| review_scores_communication | 24.46 |
| review_scores_rating | 24.44 |
| first_review | 23.28 |
| last_review | 23.28 |
| reviews_per_month | 23.28 |
| cleaning_fee | 20.52 |
| host_neighbourhood | 19.56 |
| host_response_time | 12.70 |
| host_response_rate | 12.70 |
| summary | 5.09 |
| zipcode | 3.05 |
| description | 1.82 |
| criminalidad_distrito | 0.53 |
| host_name | 0.17 |
| host_since | 0.17 |
| host_is_superhost | 0.17 |
| host_picture_url | 0.17 |
| host_listings_count | 0.17 |
| host_verifications | 0.17 |
| host_has_profile_pic | 0.17 |
| host_identity_verified | 0.17 |
| beds | 0.17 |
| name | 0.09 |
| neighbourhood | 0.05 |
| bathrooms | 0.05 |
| city | 0.02 |
| bedrooms | 0.02 |
| distrito | 0.01 |
| id | 0.00 |
| listing_url | 0.00 |
| picture_url | 0.00 |
| host_id | 0.00 |
| host_url | 0.00 |
| street | 0.00 |
| neighbourhood_cleansed | 0.00 |
| neighbourhood_group_cleansed | 0.00 |
| country | 0.00 |
| latitude | 0.00 |
| longitude | 0.00 |
| is_location_exact | 0.00 |
| property_type | 0.00 |
| room_type | 0.00 |
| accommodates | 0.00 |
| amenities | 0.00 |
| price | 0.00 |
| minimum_nights | 0.00 |
| maximum_nights | 0.00 |
| has_availability | 0.00 |
| availability_30 | 0.00 |
| availability_60 | 0.00 |
| availability_90 | 0.00 |
| availability_365 | 0.00 |
| number_of_reviews | 0.00 |
| number_of_reviews_ltm | 0.00 |
| instant_bookable | 0.00 |
# 1. Crear copia de trabajo
data_clean <- data
# 2. Eliminar columnas con más del 40% de valores faltantes, salvo excepciones
na_porcentaje <- sapply(data_clean, function(x) mean(is.na(x) | x == ""))
columnas_protegidas <- c("criminalidad_distrito")
columnas_a_eliminar <- setdiff(names(na_porcentaje[na_porcentaje > 0.4]), columnas_protegidas)
data_clean <- data_clean[, !(names(data_clean) %in% columnas_a_eliminar)]
# 3. Imputar variables categóricas con "Faltante"
variables_categoricas <- names(data_clean)[sapply(data_clean, function(x) is.character(x) | is.factor(x))]
for (var in variables_categoricas) {
data_clean[[var]][is.na(data_clean[[var]]) | data_clean[[var]] == ""] <- "Faltante"
}
# 4. Eliminar filas con NA en 'id' o 'price', o "Faltante" en 'name'
data_clean <- data_clean %>%
filter(!is.na(id), !is.na(price), name != "Faltante")
# 5. Verificación
cat("✅ Columnas eliminadas por superar 40% de NA:\n")
## ✅ Columnas eliminadas por superar 40% de NA:
print(columnas_a_eliminar)
## [1] "access" "square_feet"
cat("\n✅ Dimensiones finales del dataset limpio:\n")
##
## ✅ Dimensiones finales del dataset limpio:
cat("Filas:", nrow(data_clean), "| Columnas:", ncol(data_clean), "\n")
## Filas: 19816 | Columnas: 62
cat("\n📊 Total de valores NA restantes en el dataset:\n")
##
## 📊 Total de valores NA restantes en el dataset:
print(sum(is.na(data_clean)))
## [1] 38698
# 1. Variables categóricas: porcentaje de "Faltante"
variables_categoricas <- names(data_clean)[sapply(data_clean, function(x) is.character(x) | is.factor(x))]
faltantes_cat <- data_clean %>%
summarise(across(all_of(variables_categoricas), ~ mean(. == "Faltante") * 100)) %>%
pivot_longer(cols = everything(), names_to = "variable", values_to = "pct_faltante") %>%
arrange(desc(pct_faltante))
kable(faltantes_cat, caption = "📊 Porcentaje de 'Faltante' en variables categóricas", digits = 2)
| variable | pct_faltante |
|---|---|
| neighborhood_overview | 34.22 |
| space | 26.36 |
| first_review | 23.25 |
| last_review | 23.25 |
| cleaning_fee | 20.48 |
| host_neighbourhood | 19.55 |
| host_response_time | 12.64 |
| host_response_rate | 12.64 |
| summary | 5.09 |
| zipcode | 3.05 |
| description | 1.82 |
| host_name | 0.17 |
| host_since | 0.17 |
| host_is_superhost | 0.17 |
| host_picture_url | 0.17 |
| host_verifications | 0.17 |
| host_has_profile_pic | 0.17 |
| host_identity_verified | 0.17 |
| neighbourhood | 0.05 |
| city | 0.02 |
| distrito | 0.01 |
| listing_url | 0.00 |
| name | 0.00 |
| picture_url | 0.00 |
| host_url | 0.00 |
| street | 0.00 |
| neighbourhood_cleansed | 0.00 |
| neighbourhood_group_cleansed | 0.00 |
| country | 0.00 |
| is_location_exact | 0.00 |
| property_type | 0.00 |
| room_type | 0.00 |
| amenities | 0.00 |
| price | 0.00 |
| has_availability | 0.00 |
| instant_bookable | 0.00 |
# Barplot para variables categóricas con "Faltante"
faltantes_cat %>%
filter(pct_faltante > 0) %>%
ggplot(aes(x = reorder(variable, pct_faltante), y = pct_faltante)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(title = "Porcentaje de 'Faltante' en variables categóricas",
x = "Variable", y = "% de 'Faltante'")
# 2. Variables numéricas: porcentaje de NA
variables_numericas <- names(data_clean)[sapply(data_clean, is.numeric)]
faltantes_num <- data_clean %>%
summarise(across(all_of(variables_numericas), ~ mean(is.na(.)) * 100)) %>%
pivot_longer(cols = everything(), names_to = "variable", values_to = "pct_na") %>%
arrange(desc(pct_na))
kable(faltantes_num, caption = "📊 Porcentaje de NA en variables numéricas", digits = 2)
| variable | pct_na |
|---|---|
| review_scores_checkin | 24.47 |
| review_scores_location | 24.46 |
| review_scores_value | 24.45 |
| review_scores_accuracy | 24.44 |
| review_scores_cleanliness | 24.44 |
| review_scores_communication | 24.42 |
| review_scores_rating | 24.40 |
| reviews_per_month | 23.25 |
| criminalidad_distrito | 0.53 |
| host_listings_count | 0.17 |
| beds | 0.17 |
| bathrooms | 0.05 |
| bedrooms | 0.02 |
| id | 0.00 |
| host_id | 0.00 |
| latitude | 0.00 |
| longitude | 0.00 |
| accommodates | 0.00 |
| minimum_nights | 0.00 |
| maximum_nights | 0.00 |
| availability_30 | 0.00 |
| availability_60 | 0.00 |
| availability_90 | 0.00 |
| availability_365 | 0.00 |
| number_of_reviews | 0.00 |
| number_of_reviews_ltm | 0.00 |
# Barplot para variables numéricas con NA
faltantes_num %>%
filter(pct_na > 0) %>%
ggplot(aes(x = reorder(variable, pct_na), y = pct_na)) +
geom_col(fill = "firebrick") +
coord_flip() +
labs(title = "Porcentaje de NA en variables numéricas",
x = "Variable", y = "% de NA")
# Distribución por distritos
vecindarios <- data %>%
count(neighbourhood, sort = TRUE) %>%
mutate(pct = round(100 * n / sum(n), 2))
kable(vecindarios)
| neighbourhood | n | pct |
|---|---|---|
| Eixample | 3758 | 18.95 |
| Ciutat Vella | 2455 | 12.38 |
| Sants-Montjuïc | 1843 | 9.29 |
| Sant Martí | 1142 | 5.76 |
| Gràcia | 883 | 4.45 |
| Dreta de l’Eixample | 867 | 4.37 |
| El Raval | 830 | 4.18 |
| El Gòtic | 533 | 2.69 |
| Vila de Gràcia | 528 | 2.66 |
| La Nova Esquerra de l’Eixample | 489 | 2.47 |
| Sant Antoni | 488 | 2.46 |
| El Poble-sec | 456 | 2.30 |
| L’Antiga Esquerra de l’Eixample | 432 | 2.18 |
| La Sagrada Família | 417 | 2.10 |
| Sarrià-Sant Gervasi | 376 | 1.90 |
| Sant Pere/Santa Caterina | 372 | 1.88 |
| Les Corts | 330 | 1.66 |
| Horta-Guinardó | 297 | 1.50 |
| La Barceloneta | 248 | 1.25 |
| el Fort Pienc | 234 | 1.18 |
| El Poblenou | 230 | 1.16 |
| Sant Gervasi - Galvany | 194 | 0.98 |
| Sant Andreu | 180 | 0.91 |
| Camp d’en Grassot i Gràcia Nova | 166 | 0.84 |
| Nou Barris | 164 | 0.83 |
| El Camp de l’Arpa del Clot | 149 | 0.75 |
| El Born | 138 | 0.70 |
| Glòries - El Parc | 137 | 0.69 |
| Diagonal Mar - La Mar Bella | 109 | 0.55 |
| La Vila Olímpica | 104 | 0.52 |
| El Putget i Farró | 103 | 0.52 |
| Guinardó | 101 | 0.51 |
| El Besòs i el Maresme | 100 | 0.50 |
| El Baix Guinardó | 88 | 0.44 |
| La Maternitat i Sant Ramon | 79 | 0.40 |
| El Clot | 77 | 0.39 |
| Vallcarca i els Penitents | 65 | 0.33 |
| La Salut | 56 | 0.28 |
| Navas | 52 | 0.26 |
| Carmel | 49 | 0.25 |
| Provençals del Poblenou | 47 | 0.24 |
| Sant Andreu de Palomar | 44 | 0.22 |
| Sant Martí de Provençals | 39 | 0.20 |
| La Sagrera | 35 | 0.18 |
| Sarrià | 33 | 0.17 |
| El Congrés i els Indians | 28 | 0.14 |
| Can Baro | 27 | 0.14 |
| Vilapicina i la Torre Llobeta | 26 | 0.13 |
| La Verneda i La Pau | 23 | 0.12 |
| Sant Gervasi - la Bonanova | 21 | 0.11 |
| Les Tres Torres | 18 | 0.09 |
| El Coll | 16 | 0.08 |
| Sant Genís dels Agudells | 15 | 0.08 |
| Horta | 14 | 0.07 |
| La Teixonera | 13 | 0.07 |
| Turó de la Peira - Can Peguera | 13 | 0.07 |
| Pedralbes | 12 | 0.06 |
| La Font d’en Fargues | 11 | 0.06 |
| Montbau | 11 | 0.06 |
| 10 | 0.05 | |
| Porta | 10 | 0.05 |
| El Bon Pastor | 9 | 0.05 |
| Verdum - Los Roquetes | 9 | 0.05 |
| La Prosperitat | 7 | 0.04 |
| La Trinitat Vella | 7 | 0.04 |
| La Guineueta - Canyelles | 5 | 0.03 |
| La Vall d’Hebron | 5 | 0.03 |
| Trinitat Nova | 5 | 0.03 |
| Torre Baró | 1 | 0.01 |
# Ciudad
ciudades <- data %>%
count(city, sort = TRUE) %>%
mutate(pct = round(100 * n / sum(n), 2))
kable(ciudades)
| city | n | pct |
|---|---|---|
| Barcelona | 19236 | 96.99 |
| BARCELONA | 348 | 1.75 |
| L’Hospitalet de Llobregat | 67 | 0.34 |
| Barcelona | 28 | 0.14 |
| barcelona | 27 | 0.14 |
| . | 16 | 0.08 |
| Барселона | 15 | 0.08 |
| Barcelone | 13 | 0.07 |
| Sant Adrià de Besòs | 13 | 0.07 |
| Eixample | 9 | 0.05 |
| Hospitalet de Llobregat | 9 | 0.05 |
| Les Corts | 7 | 0.04 |
| Barcelona, Catalunya, ES | 5 | 0.03 |
| 4 | 0.02 | |
| 83-93 | 4 | 0.02 |
| * | 3 | 0.02 |
| Sant Cugat del Vallès | 3 | 0.02 |
| Gracia-Barcelona | 2 | 0.01 |
| Sant Adria de Besos | 2 | 0.01 |
| St Cugat del Vallès | 2 | 0.01 |
| 巴塞罗那 | 2 | 0.01 |
| 03-Jan | 1 | 0.01 |
| 08028, Barcelona | 1 | 0.01 |
| Badalona | 1 | 0.01 |
| Barcelon | 1 | 0.01 |
| Barcelona El RAVAL | 1 | 0.01 |
| Barcelonaneta | 1 | 0.01 |
| Bcn | 1 | 0.01 |
| Hospitalet | 1 | 0.01 |
| Hospitalet de Llobregat | 1 | 0.01 |
| Maragall- Hospital de Sant Pau | 1 | 0.01 |
| SANTS | 1 | 0.01 |
| Sagrada Familia, Barcelona | 1 | 0.01 |
| Sant Adrià de Besos | 1 | 0.01 |
| Sant adria de besos | 1 | 0.01 |
| Sants-Les Corts | 1 | 0.01 |
| Zona Forum | 1 | 0.01 |
| barcelona | 1 | 0.01 |
| 巴塞罗纳 | 1 | 0.01 |
# Crear columna limpia
data_clean <- data_clean %>%
mutate(city = str_to_lower(city), # minúsculas
city = str_squish(city), # quitar espacios extra
city = str_replace_all(city, "[^\\w\\s]", ""), # quitar símbolos raros
city_clean = case_when(
str_detect(city, "barcelona") ~ "Barcelona",
str_detect(city, "hospitalet") ~ "L'Hospitalet",
str_detect(city, "sant adri") ~ "Sant Adrià",
TRUE ~ "Otro"
))
# Ver resumen final
data_clean %>%
count(city_clean, sort = TRUE) %>%
mutate(pct = round(100 * n / sum(n), 2)) %>%
knitr::kable(caption = "Distribución de city_clean")
| city_clean | n | pct |
|---|---|---|
| Barcelona | 19634 | 99.08 |
| Otro | 87 | 0.44 |
| L’Hospitalet | 78 | 0.39 |
| Sant Adrià | 17 | 0.09 |
data_clean <- data_clean %>%
filter(!city_clean %in% c("L’Hospitalet", "Sant Adrià", "L'Hospitalet","st cugat del valles", "maragall hospital de sant pau", "st cugat del valles"))
otros_ciudad <- data_clean %>%
filter(city_clean == "Otro") %>%
count(city, sort = TRUE) %>%
mutate(pct = round(100 * n / sum(n), 2))
ggplot(otros_ciudad, aes(x = reorder(city, n), y = n)) +
geom_col(fill = "darkorange") +
coord_flip() +
labs(title = "Distribución de valores originales agrupados como 'Otro'",
x = "Valor original en 'city'", y = "Frecuencia")
# Definir variantes comunes de 'Barcelona' (incluyendo traducciones y errores frecuentes)
variantes_barcelona <- c("barcelone", "barcelona", "bcn", "barcelon", "巴塞罗那", "барселона", "巴塞罗納")
# Corregir valores
data_clean <- data_clean %>%
mutate(city_clean = case_when(
city %in% variantes_barcelona ~ "Barcelona",
city %in% c("eixample", "les corts", "sants", "santsles corts", "zona forum") ~ "Barcelona", # distritos
city %in% c("", "8393", "03jan", "faltante", NA) ~ "Faltante",
TRUE ~ city_clean
))
# Eliminar filas que queden como 'Faltante'
data_clean <- data_clean %>% filter(city_clean != "Faltante")
data_clean %>%
count(city_clean, sort = TRUE) %>%
mutate(pct = round(100 * n / sum(n), 2)) %>%
knitr::kable(caption = "Distribución de city_clean")
| city_clean | n | pct |
|---|---|---|
| Barcelona | 19685 | 99.96 |
| Otro | 8 | 0.04 |
otros_ciudad <- data_clean %>%
filter(city_clean == "Otro") %>%
count(city, sort = TRUE) %>%
mutate(pct = round(100 * n / sum(n), 2))
ggplot(otros_ciudad, aes(x = reorder(city, n), y = n)) +
geom_col(fill = "darkorange") +
coord_flip() +
labs(title = "Distribución de valores originales agrupados como 'Otro'",
x = "Valor original en 'city'", y = "Frecuencia")
# Asignar el valor en chino "巴塞罗那" y "maragall hospital de sant pau" como "Barcelona"
data_clean <- data_clean %>%
mutate(city_clean = case_when(
city_clean == "巴塞罗那" ~ "Barcelona", # Asigna el valor en chino a "Barcelona"
city_clean == "maragall hospital de sant pau" ~ "Barcelona", # Asigna "maragall hospital de sant pau" a "Barcelona"
TRUE ~ city_clean)) %>% # Deja los demás valores como están
filter(!(city_clean == "Otro")) # Eliminar todos los valores "Otro"
neigh_summary <- data_clean %>%
count(neighbourhood, sort = TRUE) %>%
mutate(pct = 100 * n / sum(n)) %>%
filter(pct < 0.1)
kable(neigh_summary, caption = "Barrios con menos del 0.1% de representación")
| neighbourhood | n | pct |
|---|---|---|
| Les Tres Torres | 18 | 0.0914402 |
| El Coll | 16 | 0.0812802 |
| Sant Genís dels Agudells | 15 | 0.0762002 |
| Horta | 14 | 0.0711201 |
| La Teixonera | 13 | 0.0660401 |
| Turó de la Peira - Can Peguera | 13 | 0.0660401 |
| Pedralbes | 12 | 0.0609601 |
| Montbau | 11 | 0.0558801 |
| La Font d’en Fargues | 10 | 0.0508001 |
| Porta | 10 | 0.0508001 |
| El Bon Pastor | 9 | 0.0457201 |
| Verdum - Los Roquetes | 9 | 0.0457201 |
| La Prosperitat | 7 | 0.0355601 |
| La Trinitat Vella | 7 | 0.0355601 |
| Faltante | 6 | 0.0304801 |
| La Guineueta - Canyelles | 5 | 0.0254001 |
| La Vall d’Hebron | 5 | 0.0254001 |
| Trinitat Nova | 5 | 0.0254001 |
| Torre Baró | 1 | 0.0050800 |
data_clean %>%
count(neighbourhood, sort = TRUE) %>%
slice_max(n, n = 20) %>%
ggplot(aes(x = reorder(neighbourhood, n), y = n)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(title = "Top 20 barrios con más alojamientos",
x = "Barrio", y = "Frecuencia")
# Imputar valores faltantes en 'neighbourhood' con el valor de 'distrito'
data_clean <- data_clean %>%
mutate(neighbourhood = ifelse(
is.na(neighbourhood) | neighbourhood == "",
distrito,
neighbourhood
))
# Distribución por distritos
distritos <- data %>%
count(distrito, sort = TRUE) %>%
mutate(pct = round(100 * n / sum(n), 2))
kable(distritos)
| distrito | n | pct |
|---|---|---|
| Eixample | 6597 | 33.26 |
| Ciutat Vella | 4561 | 23.00 |
| Sants-Montjuïc | 2267 | 11.43 |
| Sant Martí | 2182 | 11.00 |
| Gràcia | 1714 | 8.64 |
| Sarrià-Sant Gervasi | 781 | 3.94 |
| Horta-Guinardó | 640 | 3.23 |
| Les Corts | 414 | 2.09 |
| Sant Andreu | 331 | 1.67 |
| Nou Barris | 241 | 1.22 |
| Barcelona | 44 | 0.22 |
| L’Hospitalet de Llobregat | 39 | 0.20 |
| San Andrés de Palomar | 15 | 0.08 |
| Sant Adrià de Besòs | 4 | 0.02 |
| 2 | 0.01 | |
| Santa Coloma de Gramenet | 1 | 0.01 |
distritos_fuera <- c(
"Barcelona",
"L’Hospitalet de Llobregat",
"L’Hospitalet de Llobregat",
"San Andrés de Palomar",
"Sant Adrià de Besòs",
"Santa Coloma de Gramenet"
)
# Eliminar esos registros (sin modificar el resto)
data_clean <- data_clean %>%
filter(!(distrito %in% distritos_fuera))
# Distribución por distritos
distritos <- data_clean %>%
count(distrito, sort = TRUE) %>%
mutate(pct = round(100 * n / sum(n), 2))
kable(distritos)
| distrito | n | pct |
|---|---|---|
| Eixample | 6574 | 33.50 |
| Ciutat Vella | 4556 | 23.21 |
| Sants-Montjuïc | 2240 | 11.41 |
| Sant Martí | 2164 | 11.03 |
| Gràcia | 1710 | 8.71 |
| Sarrià-Sant Gervasi | 775 | 3.95 |
| Horta-Guinardó | 636 | 3.24 |
| Les Corts | 397 | 2.02 |
| Sant Andreu | 331 | 1.69 |
| Nou Barris | 241 | 1.23 |
| Faltante | 1 | 0.01 |
| L’Hospitalet de Llobregat | 1 | 0.01 |
faltantes <- data_clean %>% filter(distrito == "Faltante")
hospitalet <- data_clean %>% filter(distrito == "L'Hospitalet de Llobregat")
data_clean <- data_clean %>%
filter(!(id %in% c(15124886, 36413411))) # Reemplaza con los ID reales
# Distribución por distritos
distritos <- data_clean %>%
count(distrito, sort = TRUE) %>%
mutate(pct = round(100 * n / sum(n), 2))
kable(distritos)
| distrito | n | pct |
|---|---|---|
| Eixample | 6574 | 33.50 |
| Ciutat Vella | 4556 | 23.22 |
| Sants-Montjuïc | 2240 | 11.41 |
| Sant Martí | 2164 | 11.03 |
| Gràcia | 1710 | 8.71 |
| Sarrià-Sant Gervasi | 775 | 3.95 |
| Horta-Guinardó | 636 | 3.24 |
| Les Corts | 397 | 2.02 |
| Sant Andreu | 331 | 1.69 |
| Nou Barris | 241 | 1.23 |
| L’Hospitalet de Llobregat | 1 | 0.01 |
unique(data_clean$distrito)
## [1] "Sant Martí" "Eixample"
## [3] "Gràcia" "Horta-Guinardó"
## [5] "Les Corts" "Ciutat Vella"
## [7] "Sants-Montjuïc" "Sarrià-Sant Gervasi"
## [9] "Nou Barris" "Sant Andreu"
## [11] "L'Hospitalet de Llobregat"
data_clean <- data_clean %>%
filter(distrito != "L'Hospitalet de Llobregat")
# Distribución por distritos
distritos <- data_clean %>%
count(distrito, sort = TRUE) %>%
mutate(pct = round(100 * n / sum(n), 2))
kable(distritos)
| distrito | n | pct |
|---|---|---|
| Eixample | 6574 | 33.50 |
| Ciutat Vella | 4556 | 23.22 |
| Sants-Montjuïc | 2240 | 11.41 |
| Sant Martí | 2164 | 11.03 |
| Gràcia | 1710 | 8.71 |
| Sarrià-Sant Gervasi | 775 | 3.95 |
| Horta-Guinardó | 636 | 3.24 |
| Les Corts | 397 | 2.02 |
| Sant Andreu | 331 | 1.69 |
| Nou Barris | 241 | 1.23 |
head(data_clean$price)
## [1] "$130.00" "$60.00" "$33.00" "$45.00" "$42.00" "$53.00"
# Eliminar símbolo de dólar y comas, luego convertir a numérico
data_clean$price <- as.numeric(gsub("[$,]", "", data_clean$price))
ggplot(data_clean, aes(x = price)) +
geom_histogram(bins = 50, fill = "steelblue") +
scale_x_log10() +
labs(title = "Distribución de precios (log)", x = "Precio (log)", y = "Frecuencia")
ggplot(data_clean, aes(x = price)) +
geom_histogram(aes(y = ..density..), bins = 50, fill = "steelblue", alpha = 0.6) +
geom_density(color = "darkred", size = 1) +
scale_x_log10() +
labs(title = "Distribución de precios (log)", x = "Precio", y = "Densidad")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Filtrar precios mayores a 1000 por ejemplo
ggplot(data_clean %>% filter(price < 1000), aes(x = price)) +
geom_histogram(bins = 50, fill = "darkcyan") +
labs(title = "Precios filtrados (< 1000€)", x = "Precio", y = "Frecuencia")
distritos_fuera <- c(
"Barcelona",
"L’Hospitalet de Llobregat",
"San Andrés de Palomar",
"Sant Adrià de Besòs",
"Santa Coloma de Gramenet"
)
# Eliminar esos registros (sin modificar el resto)
data_clean <- data_clean %>%
filter(!(distrito %in% distritos_fuera))
property_type_summary <- data %>%
count(property_type, name = "count") %>%
arrange(desc(count))
kable(property_type_summary, caption = "Frequency of Each Property Type")
| property_type | count |
|---|---|
| Apartment | 16843 |
| Serviced apartment | 688 |
| Loft | 466 |
| House | 380 |
| Condominium | 361 |
| Bed and breakfast | 221 |
| Hostel | 182 |
| Guest suite | 168 |
| Boutique hotel | 123 |
| Other | 69 |
| Boat | 52 |
| Hotel | 52 |
| Aparthotel | 44 |
| Casa particular (Cuba) | 39 |
| Guesthouse | 36 |
| Townhouse | 36 |
| Villa | 24 |
| Nature lodge | 8 |
| Tiny house | 8 |
| Barn | 7 |
| Camper/RV | 7 |
| Dome house | 6 |
| Chalet | 5 |
| Earth house | 3 |
| Farm stay | 2 |
| Cabin | 1 |
| Cottage | 1 |
| Dorm | 1 |
# Tipo de propiedad
ggplot(data_clean, aes(x = property_type)) +
geom_bar(fill = "darkgreen") +
coord_flip() +
labs(title = "Tipos de propiedad", x = "", y = "Frecuencia")
data <- data %>%
mutate(accommodation_type = case_when(
property_type %in% c("Apartment", "Serviced apartment", "Loft", "House", "Condominium",
"Villa", "Townhouse", "Tiny house", "Aparthotel", "Guest suite",
"Chalet", "Cabin", "Cottage", "Dome house", "Nature lodge",
"Farm stay", "Boat", "Camper/RV") ~ "Entire place",
property_type %in% c("Private room", "Casa particular (Cuba)", "Barn", "Earth house",
"Earthen home", "Yurt", "Hut", "Dorm") ~ "Private room",
property_type %in% c("Shared room", "Room in", "Hostel") ~ "Shared room",
TRUE ~ "Other"
))
data$accommodation_type[data$property_type == "Entire guest suite"] <- "Private room"
data$accommodation_type[data$property_type %in% c("Casa particular", "Earthen home", "Boat", "Camper/RV", "Tiny home")] <- "Entire place"
cross_tab <- data %>%
count(accommodation_type, property_type) %>%
arrange(accommodation_type, desc(n))
pander(cross_tab)
| accommodation_type | property_type | n |
|---|---|---|
| Entire place | Apartment | 16843 |
| Entire place | Serviced apartment | 688 |
| Entire place | Loft | 466 |
| Entire place | House | 380 |
| Entire place | Condominium | 361 |
| Entire place | Guest suite | 168 |
| Entire place | Boat | 52 |
| Entire place | Aparthotel | 44 |
| Entire place | Townhouse | 36 |
| Entire place | Villa | 24 |
| Entire place | Nature lodge | 8 |
| Entire place | Tiny house | 8 |
| Entire place | Camper/RV | 7 |
| Entire place | Dome house | 6 |
| Entire place | Chalet | 5 |
| Entire place | Farm stay | 2 |
| Entire place | Cabin | 1 |
| Entire place | Cottage | 1 |
| Other | Bed and breakfast | 221 |
| Other | Boutique hotel | 123 |
| Other | Other | 69 |
| Other | Hotel | 52 |
| Other | Guesthouse | 36 |
| Private room | Casa particular (Cuba) | 39 |
| Private room | Barn | 7 |
| Private room | Earth house | 3 |
| Private room | Dorm | 1 |
| Shared room | Hostel | 182 |
ggplot(data, aes(x = number_of_reviews)) +
geom_histogram(bins = 50, fill = "purple", color = "white") +
labs(title = "Distribución del número de reseñas por alojamiento",
x = "Número de reseñas", y = "Frecuencia")
library(naniar)
## Warning: package 'naniar' was built under R version 4.4.3
gg_miss_var(data_clean[, variables_numericas], show_pct = TRUE) +
labs(title = "Porcentaje de valores faltantes por variable numérica",
x = "Variable", y = "% faltante") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
pct_na <- data_clean[, variables_numericas] %>%
summarise(across(everything(), ~ mean(is.na(.)) * 100)) %>%
pivot_longer(cols = everything(), names_to = "variable", values_to = "pct_na") %>%
filter(pct_na > 0)
# Gráfico solo con variables que tienen NA
ggplot(pct_na, aes(x = reorder(variable, pct_na), y = pct_na)) +
geom_col(fill = "firebrick") +
coord_flip() +
labs(title = "% de valores faltantes por variable numérica (solo > 0%)",
x = "Variable", y = "% de NA") +
theme_minimal()
review_vars <- c("review_scores_rating", "review_scores_accuracy", "review_scores_checkin",
"review_scores_cleanliness", "review_scores_communication",
"review_scores_location", "review_scores_value", "reviews_per_month")
data_clean <- data_clean %>%
group_by(distrito) %>%
mutate(across(all_of(review_vars), ~ ifelse(is.na(.), median(., na.rm = TRUE), .))) %>%
ungroup()
variables_numericas <- names(data_clean)[sapply(data_clean, is.numeric)]
faltantes_num <- data_clean %>%
summarise(across(all_of(variables_numericas), ~ mean(is.na(.)) * 100)) %>%
pivot_longer(cols = everything(), names_to = "variable", values_to = "pct_na") %>%
filter(pct_na > 0) %>%
arrange(desc(pct_na))
ggplot(faltantes_num, aes(x = reorder(variable, pct_na), y = pct_na)) +
geom_col(fill = "firebrick") +
coord_flip() +
labs(title = "Variables numéricas con NA restantes",
x = "Variable", y = "% de NA") +
theme_minimal()
vars_bajas <- c("host_listings_count", "beds", "bathrooms", "bedrooms")
data_clean <- data_clean %>%
group_by(distrito) %>%
mutate(across(all_of(vars_bajas), ~ ifelse(is.na(.), median(., na.rm = TRUE), .))) %>%
ungroup()
# Verificar cuántos valores NA quedan en total
total_na <- sum(is.na(data_clean))
cat("🔍 Total de valores NA restantes en el dataset:", total_na)
## 🔍 Total de valores NA restantes en el dataset: 0
# Cargar los datos de transporte
bus_stops <- read.csv("bus_stops.csv") # Ajusta la ruta si es necesario
transports <- read.csv("transports.csv") # Ajusta la ruta si es necesario
# Verificar las primeras filas de los datasets
head(bus_stops)
## Code Transport Longitude Latitude Bus.Stop District.Name
## 1 K014 Day bus stop 2.171619 41.41374 BUS -192-- Horta-Guinardó
## 2 K014 Day bus stop 2.134902 41.42022 BUS -124-- Gràcia
## 3 K014 Day bus stop 2.162913 41.42319 BUS -117-- Horta-Guinardó
## 4 K014 Day bus stop 2.163667 41.42290 BUS -117-- Horta-Guinardó
## 5 K014 Day bus stop 2.120212 41.39721 BUS -130-- Sarrià-Sant Gervasi
## 6 K014 Day bus stop 2.138941 41.41108 BUS -131-- Sarrià-Sant Gervasi
## Neighborhood.Name
## 1 el Guinardó
## 2 Vallcarca i els Penitents
## 3 la Font d'en Fargues
## 4 la Font d'en Fargues
## 5 Sarrià
## 6 el Putxet i el Farró
head(transports)
## Code Transport Longitude Latitude
## 1 K001 Underground 2.119370 41.39920
## 2 K001 Underground 2.135427 41.39779
## 3 K001 Underground 2.185391 41.45149
## 4 K001 Underground 2.174473 41.46089
## 5 K001 Underground 2.168588 41.38720
## 6 K001 Underground 2.168507 41.38771
## Station District.Name
## 1 FGC (L6) - REINA ELISENDA (Sortida Duquesa d'Orleans)- Sarrià-Sant Gervasi
## 2 FGC (L6) - LA BONANOVA- Sarrià-Sant Gervasi
## 3 METRO (L11) - CASA DE L'AIGUA (C. Vila-Real)- Nou Barris
## 4 METRO (L11) - CIUTAT MERIDIANA (C. Pedraforca)- Nou Barris
## 5 METRO (L1) - CATALUNYA (Rda. Universitat)- Eixample
## 6 METRO (L1) - CATALUNYA (Rambla de Catalunya)- Eixample
## Neighborhood.Name
## 1 Sarrià
## 2 Sant Gervasi - Galvany
## 3 la Trinitat Nova
## 4 Ciutat Meridiana
## 5 la Dreta de l'Eixample
## 6 la Dreta de l'Eixample
# Contar las paradas de bus por distrito
library(tidyverse)
paradas_por_distrito_bus <- bus_stops %>%
group_by(District.Name) %>%
summarise(num_paradas_bus = n())
# Contar los transportes por distrito
paradas_por_distrito_transports <- transports %>%
group_by(District.Name) %>%
summarise(num_paradas_transport = n())
# Unir ambos dataframes por el nombre del distrito
paradas_por_distrito <- left_join(paradas_por_distrito_bus, paradas_por_distrito_transports, by = "District.Name")
# Ver el resultado final
paradas_por_distrito
## # A tibble: 11 × 3
## District.Name num_paradas_bus num_paradas_transport
## <chr> <int> <int>
## 1 "" 16 164
## 2 "Ciutat Vella" 167 27
## 3 "Eixample" 405 120
## 4 "Gràcia" 210 17
## 5 "Horta-Guinardó" 389 31
## 6 "Les Corts" 207 24
## 7 "Nou Barris" 330 39
## 8 "Sant Andreu" 228 40
## 9 "Sant Martí" 356 70
## 10 "Sants-Montjuïc" 415 53
## 11 "Sarrià-Sant Gervasi" 439 66
# Eliminar las filas que tienen valor vacío en 'District.Name'
paradas_por_distrito <- paradas_por_distrito %>%
filter(!is.na(District.Name) & District.Name != "")
# Ver el resultado
paradas_por_distrito
## # A tibble: 10 × 3
## District.Name num_paradas_bus num_paradas_transport
## <chr> <int> <int>
## 1 Ciutat Vella 167 27
## 2 Eixample 405 120
## 3 Gràcia 210 17
## 4 Horta-Guinardó 389 31
## 5 Les Corts 207 24
## 6 Nou Barris 330 39
## 7 Sant Andreu 228 40
## 8 Sant Martí 356 70
## 9 Sants-Montjuïc 415 53
## 10 Sarrià-Sant Gervasi 439 66
paradas_por_distrito <- paradas_por_distrito %>%
mutate(total_transporte_publico = num_paradas_bus + num_paradas_transport)
# Ver el resultado
head(paradas_por_distrito)
## # A tibble: 6 × 4
## District.Name num_paradas_bus num_paradas_transport total_transporte_publico
## <chr> <int> <int> <int>
## 1 Ciutat Vella 167 27 194
## 2 Eixample 405 120 525
## 3 Gràcia 210 17 227
## 4 Horta-Guinardó 389 31 420
## 5 Les Corts 207 24 231
## 6 Nou Barris 330 39 369
# Unir la información de las paradas de bus con los apartamentos por distrito
# Asumimos que 'data_clean' tiene la variable 'distrito' que hace referencia al distrito del apartamento
data_clean <- left_join(data_clean, paradas_por_distrito, by = c("distrito" = "District.Name"))
# Ahora puedes analizar los datos de accesibilidad por distrito
# Por ejemplo, veamos cuántas paradas de bus hay en cada apartamento
head(data_clean)
## # A tibble: 6 × 66
## id listing_url name summary space description neighborhood_overview
## <int> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 18666 https://www.airbn… Flat… "Apart… Nice… "Apartment… Apartment in Barcelo…
## 2 18674 https://www.airbn… Huge… "110m2… Apar… "110m2 apa… Apartment in Barcelo…
## 3 21605 https://www.airbn… Nice… "The f… L'ap… "The flat … Poblenou as one of t…
## 4 25786 https://www.airbn… NICE… "JUST … Room… "JUST GO T… Solo decir que a men…
## 5 31377 https://www.airbn… Room… "The r… Grea… "The room … Faltante
## 6 31380 https://www.airbn… Room… "Room … Grea… "Room for … Faltante
## # ℹ 59 more variables: picture_url <chr>, host_id <int>, host_url <chr>,
## # host_name <chr>, host_since <chr>, host_response_time <chr>,
## # host_response_rate <chr>, host_is_superhost <chr>, host_picture_url <chr>,
## # host_neighbourhood <chr>, host_listings_count <dbl>,
## # host_verifications <chr>, host_has_profile_pic <chr>,
## # host_identity_verified <chr>, street <chr>, neighbourhood <chr>,
## # neighbourhood_cleansed <chr>, neighbourhood_group_cleansed <chr>, …
data_clean <- data_clean %>%
select(-contains(".y"))
names(data_clean)
## [1] "id" "listing_url"
## [3] "name" "summary"
## [5] "space" "description"
## [7] "neighborhood_overview" "picture_url"
## [9] "host_id" "host_url"
## [11] "host_name" "host_since"
## [13] "host_response_time" "host_response_rate"
## [15] "host_is_superhost" "host_picture_url"
## [17] "host_neighbourhood" "host_listings_count"
## [19] "host_verifications" "host_has_profile_pic"
## [21] "host_identity_verified" "street"
## [23] "neighbourhood" "neighbourhood_cleansed"
## [25] "neighbourhood_group_cleansed" "city"
## [27] "zipcode" "country"
## [29] "latitude" "longitude"
## [31] "is_location_exact" "property_type"
## [33] "room_type" "accommodates"
## [35] "bathrooms" "bedrooms"
## [37] "beds" "amenities"
## [39] "price" "cleaning_fee"
## [41] "minimum_nights" "maximum_nights"
## [43] "has_availability" "availability_30"
## [45] "availability_60" "availability_90"
## [47] "availability_365" "number_of_reviews"
## [49] "number_of_reviews_ltm" "first_review"
## [51] "last_review" "review_scores_rating"
## [53] "review_scores_accuracy" "review_scores_cleanliness"
## [55] "review_scores_checkin" "review_scores_communication"
## [57] "review_scores_location" "review_scores_value"
## [59] "instant_bookable" "reviews_per_month"
## [61] "distrito" "criminalidad_distrito"
## [63] "city_clean" "num_paradas_bus"
## [65] "num_paradas_transport" "total_transporte_publico"
head(data_clean)
## # A tibble: 6 × 66
## id listing_url name summary space description neighborhood_overview
## <int> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 18666 https://www.airbn… Flat… "Apart… Nice… "Apartment… Apartment in Barcelo…
## 2 18674 https://www.airbn… Huge… "110m2… Apar… "110m2 apa… Apartment in Barcelo…
## 3 21605 https://www.airbn… Nice… "The f… L'ap… "The flat … Poblenou as one of t…
## 4 25786 https://www.airbn… NICE… "JUST … Room… "JUST GO T… Solo decir que a men…
## 5 31377 https://www.airbn… Room… "The r… Grea… "The room … Faltante
## 6 31380 https://www.airbn… Room… "Room … Grea… "Room for … Faltante
## # ℹ 59 more variables: picture_url <chr>, host_id <int>, host_url <chr>,
## # host_name <chr>, host_since <chr>, host_response_time <chr>,
## # host_response_rate <chr>, host_is_superhost <chr>, host_picture_url <chr>,
## # host_neighbourhood <chr>, host_listings_count <dbl>,
## # host_verifications <chr>, host_has_profile_pic <chr>,
## # host_identity_verified <chr>, street <chr>, neighbourhood <chr>,
## # neighbourhood_cleansed <chr>, neighbourhood_group_cleansed <chr>, …
centros_medicos <- read.csv("centros_medicos.csv", sep = ";", stringsAsFactors = FALSE, fileEncoding = "Latin1")
centros_medicos_clean <- centros_medicos[!is.na(centros_medicos$addresses_district_name) & centros_medicos$addresses_district_name != "", ]
recuento_centros_salud <- centros_medicos_clean %>%
group_by(addresses_district_name) %>%
summarise(total_centros_salud_distrito = n())
data_clean <- left_join(data_clean, recuento_centros_salud, by = c("distrito" = "addresses_district_name"))
head(data_clean)
## # A tibble: 6 × 67
## id listing_url name summary space description neighborhood_overview
## <int> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 18666 https://www.airbn… Flat… "Apart… Nice… "Apartment… Apartment in Barcelo…
## 2 18674 https://www.airbn… Huge… "110m2… Apar… "110m2 apa… Apartment in Barcelo…
## 3 21605 https://www.airbn… Nice… "The f… L'ap… "The flat … Poblenou as one of t…
## 4 25786 https://www.airbn… NICE… "JUST … Room… "JUST GO T… Solo decir que a men…
## 5 31377 https://www.airbn… Room… "The r… Grea… "The room … Faltante
## 6 31380 https://www.airbn… Room… "Room … Grea… "Room for … Faltante
## # ℹ 60 more variables: picture_url <chr>, host_id <int>, host_url <chr>,
## # host_name <chr>, host_since <chr>, host_response_time <chr>,
## # host_response_rate <chr>, host_is_superhost <chr>, host_picture_url <chr>,
## # host_neighbourhood <chr>, host_listings_count <dbl>,
## # host_verifications <chr>, host_has_profile_pic <chr>,
## # host_identity_verified <chr>, street <chr>, neighbourhood <chr>,
## # neighbourhood_cleansed <chr>, neighbourhood_group_cleansed <chr>, …
parques_jardines <- read.csv("parques_jardines.csv", sep = ";", stringsAsFactors = FALSE, fileEncoding = "Latin1")
parques_jardines_clean <- parques_jardines[!is.na(parques_jardines$addresses_district_name) & parques_jardines$addresses_district_name != "", ]
recuento_parques_jardines <- parques_jardines_clean %>%
group_by(addresses_district_name) %>%
summarise(total_parques_jardines_distrito = n())
data_clean <- left_join(data_clean, recuento_parques_jardines, by = c("distrito" = "addresses_district_name"))
head(data_clean)
## # A tibble: 6 × 68
## id listing_url name summary space description neighborhood_overview
## <int> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 18666 https://www.airbn… Flat… "Apart… Nice… "Apartment… Apartment in Barcelo…
## 2 18674 https://www.airbn… Huge… "110m2… Apar… "110m2 apa… Apartment in Barcelo…
## 3 21605 https://www.airbn… Nice… "The f… L'ap… "The flat … Poblenou as one of t…
## 4 25786 https://www.airbn… NICE… "JUST … Room… "JUST GO T… Solo decir que a men…
## 5 31377 https://www.airbn… Room… "The r… Grea… "The room … Faltante
## 6 31380 https://www.airbn… Room… "Room … Grea… "Room for … Faltante
## # ℹ 61 more variables: picture_url <chr>, host_id <int>, host_url <chr>,
## # host_name <chr>, host_since <chr>, host_response_time <chr>,
## # host_response_rate <chr>, host_is_superhost <chr>, host_picture_url <chr>,
## # host_neighbourhood <chr>, host_listings_count <dbl>,
## # host_verifications <chr>, host_has_profile_pic <chr>,
## # host_identity_verified <chr>, street <chr>, neighbourhood <chr>,
## # neighbourhood_cleansed <chr>, neighbourhood_group_cleansed <chr>, …
SACAR LAS DISTANCIAS A LOS PUNTOS IMPORTANTES
library(geosphere)
## Warning: package 'geosphere' was built under R version 4.4.3
destinos <- data.frame(
name = c("Sagrada_Familia", "Las_Ramblas", "Barceloneta", "Casa_Batllo"),
lon = c(2.1743128817274022,
2.169795139397786,
2.193627948808803,
2.164892558555271),
lat = c(41.40376667643144,
41.385640869698186,
41.38045855785224,
41.391774395024854),
stringsAsFactors = FALSE
)
coords_origen <- as.matrix(data_clean[, c("longitude","latitude")])
matriz_km <- sapply(1:nrow(destinos), function(i) {
distHaversine(coords_origen,
c(destinos$lon[i], destinos$lat[i])) / 1000
})
colnames(matriz_km) <- destinos$name
data_clean <- cbind(
data_clean,
as.data.frame(matriz_km)
)
head(data)
## id listing_url
## 1 18666 https://www.airbnb.com/rooms/18666
## 2 18674 https://www.airbnb.com/rooms/18674
## 3 21605 https://www.airbnb.com/rooms/21605
## 4 23197 https://www.airbnb.com/rooms/23197
## 5 25786 https://www.airbnb.com/rooms/25786
## 6 31377 https://www.airbnb.com/rooms/31377
## name
## 1 Flat with Sunny Terrace
## 2 Huge flat for 8 people close to Sagrada Familia
## 3 Nice and sunny duble room
## 4 FORUM DELUXE 5 MINS WALK CCIB CENTER & SEA!
## 5 NICE ROOM AVAILABLE IN THE HEART OF GRACIA
## 6 Room for 2, Sagrada Famili
## summary
## 1 Apartment located near the "Plaza de las Glorias" and the second-hand market (Encants). The accommodation is also close to the National Theatre of Catalunya and the Agbar Tower which has become one of the new symbols of Barcelona. Licence number: HUTB-(PHONE NUMBER HIDDEN)
## 2 110m2 apartment to rent in Barcelona. Located in the Eixample district, near the Sagrada Familia. It has a small balcony where you can see the temple of Gaudi. Capacity for 8 people. Licence number: HUTB-002062
## 3 The flat is in Poblenou district, and the room is a double room with a double bed, a wardrobe, a table, TV, wifi, heating and wood floor. Beautiful and charmy.
## 4 I do not accept groups of young people under 25, the apartment is not suitable for you, ideal for families and quiet people. Beautiful apartment, large terrace, 5 min walk CCIB center, sea, Port Forum. Great location for combining business with pleasure. After a long day at a conference, beach or sightseeing, sit out and relax on the large terrace for a quiet dinner or a nice cold glass of beer or wine away from the bustle and noise of Barcelona and tourist crowds.
## 5 JUST GO THROUGH THE MANY REVIEWS I GOT THROUGH THE YEARS, NO BETTER FEEDBACK THAN THAT. WELCOME.
## 6 The room in 500 m from Sagrada Familia. 3 branches of the subway in 7 minutes of walking. A straight line to the center and the beach. A safe area area with the developed infrastructure. Completely equipped kitchen, a washing machine, an air conditioner. Private bathroom. Wi-fi free.
## space
## 1 Nice apartment situated on the penthouse floor of a building with elevator. Huge Living/dining-room with double sofa-bed 1 bedroom with two single beds 1 bedroom with double bed Nice kitchen opened to the living/dining-room and fully equipped for 6 people Bathroom with shower The accommodation has been recently renovated and tastefully decorated with a comfortable furniture and wood floor. Also it is equipped with air-conditioning and heating.
## 2 Apartment with 110 m2 located in the 6th floor in a building with elevator Huge living/dinig-room 1 double bedrrom 1 bedroom with 2 single beds 1 bedroom with bunk beds Kitchen fully equipped for 8 people 1 bathroom with bathtub 1 small bathroom with shower balcony The accommodation has been recently renovated and tastefully decorated with a comfortable furniture and wood floor. Also it is equipped with heating, air conditioning and wifi.
## 3 L'apartament està al barri de Poblenou, i l'habitació te un llit doble de (Phone number hidden by Airbnb) armari, una taula i cadira d'estudi, TV, wifi, calefacció i terra de parquet. Bonica i encantadora. A prop de la platja, Glòries, Sagrada Família, museu del disseny, els encants, rambla del Poblenou, torre Agbar, Forum. L'habitació no té clau
## 4 Elegant spacious apartment suitable for 6. Ample lounge/dining area with AC, floor to ceiling sliding glass doors open out to the large balcony with dining table & chairs and wicker sofa. Master bedroom with ensuite bathroom. ceiling fan, built in wardrobes and view of the Tibidabo mountain and children´s play area. Office bedroom has bunk beds, 90x200 cm mattresses, pedestal ventilator, built in wardrobes and view of the Tibidabo mountain and childrens play area. Third bedroom, two single beds. fan, built in wardrobes and view of the Tibidabo mountain and childrens play area. Guest bathroom: walk-in shower, basin, bidet, WC. Fully equipped kitchen with access to the balcony; Refrigerator, freezer, halogen cooker, oven, microwave, Nespresso coffee machine, conventional coffee machine, toaster, electric juicer, kettle, dishwasher... Babies travel cot/high chair., AC only in the lounge, ceiling and pedestal fans in bedrooms. Laundry with washing machine and dryer, ironing equipment, F
## 5 Room available for rent.- PEDRO PEREZ. Shared with a Catalan male aged 38, Ayurvedic massage therapist and Yoga practitioner. Looking for people non-smoking, enthusiastic willing to share more than just the space in a centric beautiful flat in PLaça Vila de Gracia. i am very flexible you can use anything in the house feel free to ask anything! The neighborhood is really special you could live here and not needing anything from outside, such an experience, just 100 years ago was a village in the outskirts of barcelona, we do have our own cultural program throughout the year, very Catalan place. The area is full of bohemians, artisans and modern artists. Most of the area has been taken over by us over the past 10 years making it a mix between the past and the present-future. Metro stations around are: Diagonal L3-L5, Fontana L3, Joanic L4, 10-15 minutes walking to city center Ramblas. Separate Wardrobe room available Kitchen and bathroom shared Bills included available for renti
## 6 Great location, right next to Sagrada Familia, the symbol of Barcelona. Great infrastructure, a shopping area nearby. Next to two metro stations: Hospital de Sant Pau & Alfonso X. The room is in great condition, newly furnished. Private bathroom. Laundry place and kitchen at your disposition. Wi-fi internet. ХОРОШИЙ РАЙОН С РАЗВИТОЙ ИНФРАСТРУКТУРОЙ. РЯДОМ САГРАДА ФАМИЛИЯ, САН ПАУ, ПАРК ГУЭЛЬ, АВЕНИДА ГАУДИ. ОТ МЕТРО SAN PAU 3 МИН. ОТ ALFONS X 5 МИН - ПРЯМАЯ ВЕТКА ДО ПЛЯЖА. ЕВРОПЕЙСКИЕ УСЛОВИЯ. КОНДИЦИОНЕРЫ. БЕСПЛАТНЫЙ ИНТЕРНЕТ WI-FI ADSL. КОМПЬЮТЕР ПОЛНОСТЬЮ ОБОРУДОВАННАЯ КУХНЯ С ПОСУДОЙ. ЧАЙ, КОФЕ, САХАР - БЕСПЛАТНО. СТИРАЛЬНАЯ МАШИНА, УТЮГ, ОБОГРЕВ И Т.Д. ВЫДАЕМ ПОСТЕЛЬНОЕ БЕЛЬЕ, ПОЛОТЕНЦА, ВКЛЮЧАЯ ПЛЯЖНЫЕ, ФЕН, ШАМПУНЬ, ГЕЛЬ ДЛЯ ДУША.
## description
## 1 Apartment located near the "Plaza de las Glorias" and the second-hand market (Encants). The accommodation is also close to the National Theatre of Catalunya and the Agbar Tower which has become one of the new symbols of Barcelona. Licence number: HUTB-(PHONE NUMBER HIDDEN) Nice apartment situated on the penthouse floor of a building with elevator. Huge Living/dining-room with double sofa-bed 1 bedroom with two single beds 1 bedroom with double bed Nice kitchen opened to the living/dining-room and fully equipped for 6 people Bathroom with shower The accommodation has been recently renovated and tastefully decorated with a comfortable furniture and wood floor. Also it is equipped with air-conditioning and heating. Free Wifi - air conditioning. We will provide basic amenities like shower gel, shampoo,and hand soap. Also, 1 set of bed linen and towels per person will be included. We can provide you all kind of entrance and tickets for monuments and shows in Barcelona in order you avo
## 2 110m2 apartment to rent in Barcelona. Located in the Eixample district, near the Sagrada Familia. It has a small balcony where you can see the temple of Gaudi. Capacity for 8 people. Licence number: HUTB-002062 Apartment with 110 m2 located in the 6th floor in a building with elevator Huge living/dinig-room 1 double bedrrom 1 bedroom with 2 single beds 1 bedroom with bunk beds Kitchen fully equipped for 8 people 1 bathroom with bathtub 1 small bathroom with shower balcony The accommodation has been recently renovated and tastefully decorated with a comfortable furniture and wood floor. Also it is equipped with heating, air conditioning and wifi. Free Wifi - air conditioning. We will provide basic amenities like shower gel, shampoo,and hand soap. Also, 1 set of bed linen and towels per person will be included. We can provide you all kind of entrance and tickets for monuments and shows in Barcelona in order you avoid queues and plan your trip in advance. Also we can organize sh
## 3 The flat is in Poblenou district, and the room is a double room with a double bed, a wardrobe, a table, TV, wifi, heating and wood floor. Beautiful and charmy. L'apartament està al barri de Poblenou, i l'habitació te un llit doble de (Phone number hidden by Airbnb) armari, una taula i cadira d'estudi, TV, wifi, calefacció i terra de parquet. Bonica i encantadora. A prop de la platja, Glòries, Sagrada Família, museu del disseny, els encants, rambla del Poblenou, torre Agbar, Forum. L'habitació no té clau The kitchen is fully equipped and can use the washer and dryer. We also have a beautiful balcony on the apartment. And, of course, you can use the bathroom and the living and dining room. My husband and I will be available in person or by phone/ (Hidden by Airbnb) for any questions you have during your stay. Poblenou as one of the few areas that has grown independently, keeping away from fleeting trends and maintaining its identity. As a result it has become one of the most genuine and
## 4 I do not accept groups of young people under 25, the apartment is not suitable for you, ideal for families and quiet people. Beautiful apartment, large terrace, 5 min walk CCIB center, sea, Port Forum. Great location for combining business with pleasure. After a long day at a conference, beach or sightseeing, sit out and relax on the large terrace for a quiet dinner or a nice cold glass of beer or wine away from the bustle and noise of Barcelona and tourist crowds. Elegant spacious apartment suitable for 6. Ample lounge/dining area with AC, floor to ceiling sliding glass doors open out to the large balcony with dining table & chairs and wicker sofa. Master bedroom with ensuite bathroom. ceiling fan, built in wardrobes and view of the Tibidabo mountain and children´s play area. Office bedroom has bunk beds, 90x200 cm mattresses, pedestal ventilator, built in wardrobes and view of the Tibidabo mountain and childrens play area. Third bedroom, two single beds. fan, built in wardrobes a
## 5 JUST GO THROUGH THE MANY REVIEWS I GOT THROUGH THE YEARS, NO BETTER FEEDBACK THAN THAT. WELCOME. Room available for rent.- PEDRO PEREZ. Shared with a Catalan male aged 38, Ayurvedic massage therapist and Yoga practitioner. Looking for people non-smoking, enthusiastic willing to share more than just the space in a centric beautiful flat in PLaça Vila de Gracia. i am very flexible you can use anything in the house feel free to ask anything! The neighborhood is really special you could live here and not needing anything from outside, such an experience, just 100 years ago was a village in the outskirts of barcelona, we do have our own cultural program throughout the year, very Catalan place. The area is full of bohemians, artisans and modern artists. Most of the area has been taken over by us over the past 10 years making it a mix between the past and the present-future. Metro stations around are: Diagonal L3-L5, Fontana L3, Joanic L4, 10-15 minutes walking to city center Ramblas. S
## 6 The room in 500 m from Sagrada Familia. 3 branches of the subway in 7 minutes of walking. A straight line to the center and the beach. A safe area area with the developed infrastructure. Completely equipped kitchen, a washing machine, an air conditioner. Private bathroom. Wi-fi free. Great location, right next to Sagrada Familia, the symbol of Barcelona. Great infrastructure, a shopping area nearby. Next to two metro stations: Hospital de Sant Pau & Alfonso X. The room is in great condition, newly furnished. Private bathroom. Laundry place and kitchen at your disposition. Wi-fi internet. ХОРОШИЙ РАЙОН С РАЗВИТОЙ ИНФРАСТРУКТУРОЙ. РЯДОМ САГРАДА ФАМИЛИЯ, САН ПАУ, ПАРК ГУЭЛЬ, АВЕНИДА ГАУДИ. ОТ МЕТРО SAN PAU 3 МИН. ОТ ALFONS X 5 МИН - ПРЯМАЯ ВЕТКА ДО ПЛЯЖА. ЕВРОПЕЙСКИЕ УСЛОВИЯ. КОНДИЦИОНЕРЫ. БЕСПЛАТНЫЙ ИНТЕРНЕТ WI-FI ADSL. КОМПЬЮТЕР ПОЛНОСТЬЮ ОБОРУДОВАННАЯ КУХНЯ С ПОСУДОЙ. ЧАЙ, КОФЕ, САХАР - БЕСПЛАТНО. СТИРАЛЬНАЯ МАШИНА, УТЮГ, ОБОГРЕВ И Т.Д. ВЫДАЕМ ПОСТЕЛЬНОЕ БЕЛЬЕ, ПОЛОТЕНЦА, ВК
## neighborhood_overview
## 1 Apartment in Barcelona near to the Plaza de las Glorias, the old market (Encants), the Agbar Tower one of the new symbols of Barcelona and the Teatre Nacional de Catalunya. All kinds of services in surroundings (shops, supermarkets, restaurants, bars).
## 2 Apartment in Barcelona located in the heart of Eixample district, within only 150 m form the great Sagrada Familia and really near of Gaudí Avenue and the famous Sant Pau Hospital . All kind of services in surroundings (shops, supermarkets, restaurants, bars).
## 3 Poblenou as one of the few areas that has grown independently, keeping away from fleeting trends and maintaining its identity. As a result it has become one of the most genuine and prolific metropolitan scenarios of Barcelona city. In recent years, a series of creative hubs have found their home in Poblenou, cultural and commercial spaces that offer similar innovative proposals, becoming part of the neighbourhood’s future without giving up its industrial past. To the mission of the neighbourhood’s normalization, the work of the entrepreneurs has been added, raising the area’s value and adding it to the map of alternative cultural circuits. See more info in (Website hidden by Airbnb)
## 4 Strategically located in the area of Parc del Fòrum, a spacious area where all kinds of events and events are held. It is an area reclaimed by the sea where you can find: the Esplanade, where fairs, music festivals or large events are held; the Fòrum building, triangular in shape and the undisputed icon of the new architecture of Barcelona; the CCIB-Center de Convencions Internacional de Barcelona,; the Parc dels Auditoris, a large outdoor space in front of the sea; the spectacular photovoltaic plate, inclined and suspended over very peculiar columns, the Fòrum Marina with mega yachts and the Forum safe bathing area with access for wheelchairs ,where you can savor the genuine and Mediterranean character of the city. Great area also for walking, cycling, running..... A few minutes walk to the Diagonal Mar shopping center, frequent transportation takes you to the historic center in about 10 minutes by metro or tram T4 to the Olympic Port, etc.
## 5 Solo decir que a menudo ni salgo del barrio. Muy entretenido con sus gentes y lugares.
## 6
## access
## 1 Free Wifi - air conditioning. We will provide basic amenities like shower gel, shampoo,and hand soap. Also, 1 set of bed linen and towels per person will be included.
## 2 Free Wifi - air conditioning. We will provide basic amenities like shower gel, shampoo,and hand soap. Also, 1 set of bed linen and towels per person will be included.
## 3 The kitchen is fully equipped and can use the washer and dryer. We also have a beautiful balcony on the apartment. And, of course, you can use the bathroom and the living and dining room.
## 4 You book the entire apartment for yourselves.
## 5 All access with respect. Kitchen facilities need permission. Feel free to ask. Avoid Noise after midnight and early.morning
## 6
## picture_url
## 1 https://a0.muscache.com/im/pictures/47f88bc6-6561-445a-beec-f8ec4ddc1038.jpg?aki_policy=large
## 2 https://a0.muscache.com/im/pictures/13031453/413cdbfc_original.jpg?aki_policy=large
## 3 https://a0.muscache.com/im/pictures/774ca73d-13f4-4848-83c9-965d8332af8a.jpg?aki_policy=large
## 4 https://a0.muscache.com/im/pictures/738532/806da1bf_original.jpg?aki_policy=large
## 5 https://a0.muscache.com/im/pictures/6619f0c7-844e-40a1-8521-44c19b7a4af2.jpg?aki_policy=large
## 6 https://a0.muscache.com/im/pictures/ac805ead-12f0-4ebe-89b3-53ea9ede132f.jpg?aki_policy=large
## host_id host_url host_name host_since
## 1 71615 https://www.airbnb.com/users/show/71615 Mireia And Maria 19/01/2010
## 2 71615 https://www.airbnb.com/users/show/71615 Mireia And Maria 19/01/2010
## 3 82522 https://www.airbnb.com/users/show/82522 Meritxell 18/02/2010
## 4 90417 https://www.airbnb.com/users/show/90417 Etain (Marnie) 09/03/2010
## 5 108310 https://www.airbnb.com/users/show/108310 Pedro 14/04/2010
## 6 134698 https://www.airbnb.com/users/show/134698 Svetlana 29/05/2010
## host_response_time host_response_rate host_is_superhost
## 1 within an hour 99% f
## 2 within an hour 99% f
## 3 within a few hours 100% f
## 4 within an hour 100% t
## 5 within an hour 100% t
## 6 within an hour 100% f
## host_picture_url
## 1 https://a0.muscache.com/im/users/71615/profile_pic/1426612511/original.jpg?aki_policy=profile_x_medium
## 2 https://a0.muscache.com/im/users/71615/profile_pic/1426612511/original.jpg?aki_policy=profile_x_medium
## 3 https://a0.muscache.com/im/pictures/ece65ffd-a798-4209-b1b0-a51060412b29.jpg?aki_policy=profile_x_medium
## 4 https://a0.muscache.com/im/users/90417/profile_pic/1300298768/original.jpg?aki_policy=profile_x_medium
## 5 https://a0.muscache.com/im/pictures/user/2b13f530-a8dd-4777-93a5-a133ac46b97d.jpg?aki_policy=profile_x_medium
## 6 https://a0.muscache.com/im/users/134698/profile_pic/1334849467/original.jpg?aki_policy=profile_x_medium
## host_neighbourhood host_listings_count
## 1 El Camp de l'Arpa del Clot 45
## 2 El Camp de l'Arpa del Clot 45
## 3 El Poblenou 2
## 4 El Besòs i el Maresme 5
## 5 Vila de Gràcia 1
## 6 El Baix Guinardó 9
## host_verifications
## 1 ['email', 'phone', 'reviews', 'jumio', 'government_id']
## 2 ['email', 'phone', 'reviews', 'jumio', 'government_id']
## 3 ['email', 'phone', 'reviews', 'jumio', 'offline_government_id', 'government_id']
## 4 ['email', 'phone', 'reviews', 'jumio', 'offline_government_id', 'selfie', 'government_id', 'identity_manual']
## 5 ['email', 'phone', 'reviews', 'jumio', 'offline_government_id', 'selfie', 'government_id', 'identity_manual']
## 6 ['email', 'phone', 'reviews']
## host_has_profile_pic host_identity_verified
## 1 t t
## 2 t t
## 3 t t
## 4 t t
## 5 t t
## 6 t f
## street neighbourhood
## 1 Barcelona, CT, Spain Sant Martí
## 2 Barcelona, CT, Spain La Sagrada Família
## 3 Barcelona, Catalunya, Spain Sant Martí
## 4 Sant Adria de Besos, Barcelona, Spain Sant Martí
## 5 Barcelona, Barcelona, Spain Vila de Gràcia
## 6 Barcelona, CT, Spain Horta-Guinardó
## neighbourhood_cleansed neighbourhood_group_cleansed city
## 1 el Camp de l'Arpa del Clot Sant Martí Barcelona
## 2 la Sagrada Família Eixample Barcelona
## 3 el Poblenou Sant Martí Barcelona
## 4 el Besòs i el Maresme Sant Martí Sant Adria de Besos
## 5 la Vila de Gràcia Gràcia Barcelona
## 6 el Baix Guinardó Horta-Guinardó Barcelona
## zipcode country latitude longitude is_location_exact property_type
## 1 8026 Spain 41.40889 2.18555 t Apartment
## 2 8025 Spain 41.40420 2.17306 t Apartment
## 3 8018 Spain 41.40560 2.19821 t Apartment
## 4 8930 Spain 41.41203 2.22114 f Apartment
## 5 8012 Spain 41.40145 2.15645 t Apartment
## 6 8025 Spain 41.41097 2.17070 t Apartment
## room_type accommodates bathrooms bedrooms beds
## 1 Entire home/apt 6 1 2 4
## 2 Entire home/apt 8 2 3 6
## 3 Private room 2 1 1 1
## 4 Entire home/apt 6 2 3 8
## 5 Private room 2 1 1 1
## 6 Private room 2 1 1 2
## amenities
## 1 {TV,Internet,Wifi,"Air conditioning","Wheelchair accessible",Kitchen,Elevator,"Free street parking",Heating,"Family/kid friendly",Washer,Dryer,Essentials,Shampoo,"Hair dryer","Hot water","Host greets you","Paid parking on premises"}
## 2 {TV,Internet,Wifi,"Air conditioning","Wheelchair accessible",Kitchen,Elevator,"Free street parking","Buzzer/wireless intercom",Heating,"Family/kid friendly",Washer,Essentials,Shampoo,Hangers,"Hair dryer",Iron,"Laptop friendly workspace",Crib,"Hot water","Host greets you","Paid parking on premises"}
## 3 {TV,Wifi,Kitchen,"Paid parking off premises",Elevator,Heating,"Family/kid friendly",Washer,Dryer,"First aid kit",Essentials,Shampoo,Hangers,"Hair dryer",Iron,"Laptop friendly workspace","Self check-in","Smart lock","Hot water","Bed linens","Extra pillows and blankets",Microwave,"Coffee maker",Refrigerator,Dishwasher,"Dishes and silverware","Cooking basics",Oven,Stove,"Patio or balcony","Luggage dropoff allowed","Cleaning before checkout","No stairs or steps to enter","Wide entrance for guests","Flat path to guest entrance","Well-lit path to entrance","No stairs or steps to enter","No stairs or steps to enter","No stairs or steps to enter","Wide entryway","Paid parking on premises"}
## 4 {TV,Internet,Wifi,"Wheelchair accessible",Kitchen,"Paid parking off premises",Elevator,"Buzzer/wireless intercom",Heating,"Family/kid friendly",Washer,Dryer,"Smoke detector","Carbon monoxide detector","Fire extinguisher",Essentials,Shampoo,Hangers,"Hair dryer",Iron,"Laptop friendly workspace","High chair",Crib,"Pack ’n Play/travel crib","Hot water","Bed linens",Microwave,"Coffee maker",Refrigerator,Dishwasher,"Dishes and silverware","Cooking basics",Oven,Stove,"Patio or balcony","Luggage dropoff allowed","Long term stays allowed","Wide hallways","No stairs or steps to enter","Wide entrance for guests","Flat path to guest entrance","Well-lit path to entrance","No stairs or steps to enter","Wide entryway","Host greets you","Paid parking on premises"}
## 5 {TV,Wifi,"Air conditioning",Kitchen,"Smoking allowed",Elevator,Heating,"Family/kid friendly",Washer,"Fire extinguisher",Essentials,Shampoo,"Lock on bedroom door",Hangers,"Hair dryer","Hot water","Luggage dropoff allowed"}
## 6 {Wifi,"Air conditioning",Kitchen,"Paid parking off premises","Buzzer/wireless intercom",Heating,"Family/kid friendly",Washer,Essentials,Shampoo,"Lock on bedroom door",Hangers,"Hair dryer",Iron,"translation missing: en.hosting_amenity_50","Bed linens","Extra pillows and blankets",Microwave,"Coffee maker",Refrigerator,"Dishes and silverware","Cooking basics",Oven,Stove,"Luggage dropoff allowed","Host greets you"}
## square_feet price cleaning_fee minimum_nights maximum_nights
## 1 75 $130.00 $42.00 3 730
## 2 NA $60.00 $50.00 1 1125
## 3 108 $33.00 2 1125
## 4 NA $210.00 $80.00 3 1125
## 5 NA $45.00 1 730
## 6 NA $42.00 3 1125
## has_availability availability_30 availability_60 availability_90
## 1 t 0 0 0
## 2 t 3 20 50
## 3 t 4 8 15
## 4 t 11 33 63
## 5 t 8 19 41
## 6 t 5 8 16
## availability_365 number_of_reviews number_of_reviews_ltm first_review
## 1 182 1 0 10/10/2015
## 2 129 15 10 27/05/2013
## 3 15 119 36 08/05/2016
## 4 318 45 16 15/03/2011
## 5 115 241 49 11/08/2010
## 6 211 4 0 20/05/2015
## last_review review_scores_rating review_scores_accuracy
## 1 10/10/2015 80 10
## 2 02/07/2019 87 9
## 3 04/07/2019 90 10
## 4 07/07/2019 95 10
## 5 03/07/2019 95 10
## 6 12/03/2018 95 9
## review_scores_cleanliness review_scores_checkin review_scores_communication
## 1 10 2 10
## 2 9 10 10
## 3 9 10 10
## 4 10 10 10
## 5 10 10 10
## 6 10 10 10
## review_scores_location review_scores_value instant_bookable reviews_per_month
## 1 10 8 f 0.02
## 2 9 8 t 0.20
## 3 9 9 f 3.08
## 4 9 9 t 0.44
## 5 10 9 t 2.22
## 6 9 9 f 0.08
## distrito criminalidad_distrito accommodation_type
## 1 Sant Martí 25408 Entire place
## 2 Eixample 46754 Entire place
## 3 Sant Martí 25408 Entire place
## 4 Sant Martí 25408 Entire place
## 5 Gràcia 8588 Entire place
## 6 Horta-Guinardó 10057 Entire place
names(data_clean)
## [1] "id" "listing_url"
## [3] "name" "summary"
## [5] "space" "description"
## [7] "neighborhood_overview" "picture_url"
## [9] "host_id" "host_url"
## [11] "host_name" "host_since"
## [13] "host_response_time" "host_response_rate"
## [15] "host_is_superhost" "host_picture_url"
## [17] "host_neighbourhood" "host_listings_count"
## [19] "host_verifications" "host_has_profile_pic"
## [21] "host_identity_verified" "street"
## [23] "neighbourhood" "neighbourhood_cleansed"
## [25] "neighbourhood_group_cleansed" "city"
## [27] "zipcode" "country"
## [29] "latitude" "longitude"
## [31] "is_location_exact" "property_type"
## [33] "room_type" "accommodates"
## [35] "bathrooms" "bedrooms"
## [37] "beds" "amenities"
## [39] "price" "cleaning_fee"
## [41] "minimum_nights" "maximum_nights"
## [43] "has_availability" "availability_30"
## [45] "availability_60" "availability_90"
## [47] "availability_365" "number_of_reviews"
## [49] "number_of_reviews_ltm" "first_review"
## [51] "last_review" "review_scores_rating"
## [53] "review_scores_accuracy" "review_scores_cleanliness"
## [55] "review_scores_checkin" "review_scores_communication"
## [57] "review_scores_location" "review_scores_value"
## [59] "instant_bookable" "reviews_per_month"
## [61] "distrito" "criminalidad_distrito"
## [63] "city_clean" "num_paradas_bus"
## [65] "num_paradas_transport" "total_transporte_publico"
## [67] "total_centros_salud_distrito" "total_parques_jardines_distrito"
## [69] "Sagrada_Familia" "Las_Ramblas"
## [71] "Barceloneta" "Casa_Batllo"
OBJETIVO 1
Nos quedaremos unicamente con los aribnb que sean a
unique(data_clean$property_type)
## [1] "Apartment" "Loft" "Bed and breakfast"
## [4] "Serviced apartment" "Condominium" "Villa"
## [7] "Boat" "Other" "House"
## [10] "Aparthotel" "Guesthouse" "Boutique hotel"
## [13] "Townhouse" "Guest suite" "Nature lodge"
## [16] "Hostel" "Tiny house" "Casa particular (Cuba)"
## [19] "Hotel" "Chalet" "Dorm"
## [22] "Camper/RV" "Cabin" "Farm stay"
## [25] "Barn" "Dome house" "Earth house"
## [28] "Cottage"
# Ver valores que contienen "apartment" o "apartamento"
table(tolower(data_clean$property_type))
##
## aparthotel apartment barn
## 44 16691 7
## bed and breakfast boat boutique hotel
## 219 45 104
## cabin camper/rv casa particular (cuba)
## 1 7 39
## chalet condominium cottage
## 4 356 1
## dome house dorm earth house
## 6 1 2
## farm stay guest suite guesthouse
## 2 165 35
## hostel hotel house
## 182 51 371
## loft nature lodge other
## 462 8 69
## serviced apartment tiny house townhouse
## 687 7 35
## villa
## 23
# Filtrar solo tipos de propiedad que son apartamentos
tipos_apartamento <- c("Apartment")
data_clean <- data_clean %>%
filter(property_type %in% tipos_apartamento)
data_clean$room_type_normalizado <- ifelse(data_clean$room_type == "Entire home/apt", 1, 0)
Dentro de estos vamos a ver cuantos tipos de habitacion tenemos y ver si hay algunbo que podamos eliminars (room_type)
cross_tab2 <- data_clean %>%
count(room_type)
pander(cross_tab2)
| room_type | n |
|---|---|
| Entire home/apt | 8097 |
| Private room | 8518 |
| Shared room | 76 |
Vemos que shared room tiene pocos valores por lo que vamo a eliminar todos aquellos apartamentos que en room_type tengan shared room
room_validos <- c("Entire home/apt", "Private room")
data_clean <- data_clean %>%
filter(room_type %in% room_validos)
data_clean <- data_clean %>%
mutate(n_amenities = str_count(amenities, ",") + 1)
# Ver distribución y detectar visualmente outliers
boxplot(data_clean$price, horizontal = TRUE)
# Filtrar: eliminar el 1 % más alto
data_clean <- data_clean %>%
filter(price < quantile(price, 0.99, na.rm = TRUE))
objetivo unoooo
Se calcula el precio medio de los apartamentos en función del número de habitaciones para ver cómo evoluciona.
data_clean %>%
group_by(bedrooms) %>%
summarise(precio_medio = mean(price, na.rm = TRUE)) %>%
ggplot(aes(x = factor(bedrooms), y = precio_medio)) +
geom_col(fill = "steelblue") +
labs(title = "Precio medio por número de habitaciones",
x = "Habitaciones", y = "Precio medio (€)") +
theme_minimal()
Se muestra en el mismo gráfico la cantidad de alojamientos (barras) y el
precio medio (línea) por número de habitaciones.
library(ggplot2)
library(dplyr)
df_bedrooms <- data_clean %>%
group_by(bedrooms) %>%
summarise(
cantidad = n(),
precio_medio = mean(price, na.rm = TRUE)
)
ggplot(df_bedrooms, aes(x = factor(bedrooms))) +
geom_col(aes(y = cantidad), fill = "lightblue") +
geom_line(aes(y = precio_medio * 10, group = 1), color = "darkblue", size = 1) +
scale_y_continuous(
name = "Cantidad de alojamientos",
sec.axis = sec_axis(~./10, name = "Precio medio (€)")
) +
labs(title = "Habitaciones: Precio medio y cantidad", x = "Habitaciones") +
theme_minimal()
Se calcula el precio medio de los apartamentos en función del número de
baños para ver cómo evoluciona.
data_clean %>%
group_by(bathrooms) %>%
summarise(precio_medio = mean(price, na.rm = TRUE)) %>%
ggplot(aes(x = factor(bathrooms), y = precio_medio)) +
geom_col(fill = "darkorange") +
labs(title = "Precio medio por número de baños",
x = "Baños", y = "Precio medio (€)") +
theme_minimal()
Se muestra en el mismo gráfico la cantidad de baños (barras) y el precio
medio (línea) por número de habitaciones.
df_bathrooms <- data_clean %>%
group_by(bathrooms) %>%
summarise(
cantidad = n(),
precio_medio = mean(price, na.rm = TRUE)
)
ggplot(df_bathrooms, aes(x = factor(bathrooms))) +
geom_col(aes(y = cantidad), fill = "orange") +
geom_line(aes(y = precio_medio * 10, group = 1), color = "red", size = 1) +
scale_y_continuous(
name = "Cantidad de alojamientos",
sec.axis = sec_axis(~./10, name = "Precio medio (€)")
) +
labs(title = "Baños: Precio medio y cantidad", x = "Baños") +
theme_minimal()
Se calcula el precio medio de los apartamentos en función del número de
persona que pueden alojarse para ver cómo evoluciona.
data_clean %>%
group_by(accommodates) %>%
summarise(precio_medio = mean(price, na.rm = TRUE)) %>%
ggplot(aes(x = factor(accommodates), y = precio_medio)) +
geom_col(fill = "forestgreen") +
labs(title = "Precio medio por capacidad de alojamiento",
x = "Personas que puede alojar", y = "Precio medio (€)") +
theme_minimal()
Se muestra en el mismo gráfico la cantidad de persona que pueden
alojarse (barras) y el precio medio (línea) por número de
habitaciones.
df_accom <- data_clean %>%
group_by(accommodates) %>%
summarise(
cantidad = n(),
precio_medio = mean(price, na.rm = TRUE)
)
ggplot(df_accom, aes(x = factor(accommodates))) +
geom_col(aes(y = cantidad), fill = "darkseagreen") +
geom_line(aes(y = precio_medio * 10, group = 1), color = "darkgreen", size = 1) +
scale_y_continuous(
name = "Cantidad de alojamientos",
sec.axis = sec_axis(~./10, name = "Precio medio (€)")
) +
labs(title = "Capacidad: Precio medio y cantidad", x = "Personas que puede alojar") +
theme_minimal()
Se crea un modelo de regresión lineal (lm) donde la variable dependiente
es el logaritmo del precio (log(price)), y las variables independientes
son bedrooms, bathrooms y room_type.
library(dplyr)
library(stringr)
data_clean <- data_clean %>%
filter(price > 0, price < 2000)
# Paso 1: Eliminar símbolos como "$" o "€" y convertir a numérico
data_clean <- data_clean %>%
mutate(cleaning_fee = ifelse(cleaning_fee == "Faltante", NA, cleaning_fee),
cleaning_fee = as.numeric(str_replace_all(cleaning_fee, "[$€,]", "")))
data_clean$cleaning_fee[is.na(data_clean$cleaning_fee)] <- 0
# 2. Crear modelo con log(price) como variable dependiente
modelo_log <- lm(log(price) ~ bedrooms + bathrooms + room_type
, data = data_clean)
# 3. Mostrar el resumen del modelo
summary(modelo_log)
##
## Call:
## lm(formula = log(price) ~ bedrooms + bathrooms + room_type, data = data_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.60284 -0.34839 -0.02681 0.33726 2.49892
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.106541 0.014410 284.98 <2e-16 ***
## bedrooms 0.167517 0.005893 28.43 <2e-16 ***
## bathrooms 0.148169 0.009251 16.02 <2e-16 ***
## room_typePrivate room -0.706542 0.010156 -69.57 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5364 on 16434 degrees of freedom
## Multiple R-squared: 0.4561, Adjusted R-squared: 0.456
## F-statistic: 4595 on 3 and 16434 DF, p-value: < 2.2e-16
Este código aplica un modelo de regresión PLS (Partial Least Squares) con validación cruzada para predecir el logaritmo del precio.
library(pls)
library(fastDummies)
## Warning: package 'fastDummies' was built under R version 4.4.3
# 1. Preprocesamiento
data_pls <- data_clean %>%
select(price, bedrooms, bathrooms, room_type,accommodates) %>%
dummy_cols(select_columns = "room_type", remove_selected_columns = TRUE)
# 2. Definir matriz X y variable Y
X <- as.matrix(data_pls %>% select(-price))
Y <- log(data_pls$price)
# 3. Ajustar modelo PLS
modelo_pls <- plsr(Y ~ X, ncomp = 3, validation = "CV")
# 4. Resumen del modelo
summary(modelo_pls)
## Data: X dimension: 16438 5
## Y dimension: 16438 1
## Fit method: kernelpls
## Number of components considered: 3
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps
## CV 0.7273 0.5328 0.5124 0.511
## adjCV 0.7273 0.5328 0.5124 0.511
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps
## X 88.02 92.85 96.25
## Y 46.34 50.40 50.67
# 5. Validación cruzada
validationplot(modelo_pls, val.type = "MSEP") # Error cuadrático medio para cada número de componentes
Este fragmento aplica el modelo PLS ajustado para predecir precios y
transforma los resultados.
data_clean$log_precio_estimado_pls <- predict(modelo_pls, ncomp = 3)[, , 1]
data_clean$precio_estimado_pls <- round(exp(data_clean$log_precio_estimado_pls), 2)
data_clean$precio_estimado_pls <- pmin(data_clean$precio_estimado_pls, max(data_clean$price, na.rm = TRUE))
summary(data_clean$precio_estimado_pls)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 33.17 42.47 57.27 76.97 98.35 599.00
Se crea una tabla para comparar los precios reales con los estimados, se diferencia entre apartamentos enteros y privados. También una diferencia en valor absoluto para poder apreciar cuánto se va el modelo.
library(dplyr)
library(knitr)
precio_medio_distrito_tipo_pls <- data_clean %>%
group_by(distrito, room_type) %>%
summarise(
n_pisos = n(),
precio_medio_estimado_pls = round(mean(precio_estimado_pls, na.rm = TRUE), 2),
precio_medio_real = round(mean(price, na.rm = TRUE), 2),
diferencia_absoluta = round(abs(precio_medio_estimado_pls - precio_medio_real), 2)
) %>%
arrange(desc(precio_medio_estimado_pls))
## `summarise()` has grouped output by 'distrito'. You can override using the
## `.groups` argument.
kable(precio_medio_distrito_tipo_pls,
caption = "Precio medio estimado por noche (PLS) en cada distrito y tipo de habitación, con diferencia absoluta respecto al precio real")
| distrito | room_type | n_pisos | precio_medio_estimado_pls | precio_medio_real | diferencia_absoluta |
|---|---|---|---|---|---|
| Eixample | Entire home/apt | 3054 | 124.79 | 153.92 | 29.13 |
| Sarrià-Sant Gervasi | Entire home/apt | 357 | 116.08 | 133.97 | 17.89 |
| Sant Martí | Entire home/apt | 748 | 112.50 | 131.88 | 19.38 |
| Sant Andreu | Entire home/apt | 57 | 111.90 | 82.65 | 29.25 |
| Les Corts | Entire home/apt | 123 | 109.82 | 111.87 | 2.05 |
| Sants-Montjuïc | Entire home/apt | 920 | 109.41 | 114.86 | 5.45 |
| Gràcia | Entire home/apt | 705 | 109.28 | 119.09 | 9.81 |
| Horta-Guinardó | Entire home/apt | 162 | 105.78 | 111.77 | 5.99 |
| Ciutat Vella | Entire home/apt | 1781 | 97.63 | 99.67 | 2.04 |
| Nou Barris | Entire home/apt | 26 | 93.67 | 85.88 | 7.79 |
| Les Corts | Private room | 182 | 44.60 | 42.07 | 2.53 |
| Ciutat Vella | Private room | 2194 | 44.14 | 58.90 | 14.76 |
| Eixample | Private room | 2459 | 43.69 | 50.89 | 7.20 |
| Sant Martí | Private room | 1011 | 42.57 | 43.32 | 0.75 |
| Sants-Montjuïc | Private room | 985 | 42.49 | 46.18 | 3.69 |
| Gràcia | Private room | 665 | 42.02 | 46.82 | 4.80 |
| Nou Barris | Private room | 174 | 42.01 | 38.82 | 3.19 |
| Sant Andreu | Private room | 213 | 42.01 | 37.38 | 4.63 |
| Sarrià-Sant Gervasi | Private room | 274 | 41.85 | 47.42 | 5.57 |
| Horta-Guinardó | Private room | 348 | 41.69 | 40.11 | 1.58 |
Este bloque de código carga un shapefile con los límites geográficos de los distritos de Barcelona.Esta archivo representa los límites de los distritos.
library(sf)
## Warning: package 'sf' was built under R version 4.4.3
## Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
distritos_sf <- st_read("0301040100_Districtes_UNITATS_ADM.shp")
## Reading layer `0301040100_Districtes_UNITATS_ADM' from data source
## `C:\Users\usuario\Desktop\Proyecto II\0301040100_Districtes_UNITATS_ADM.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 10 features and 46 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 420812.5 ymin: 4574282 xmax: 435480.4 ymax: 4591066
## Projected CRS: ETRS89 / UTM zone 31N
names(distritos_sf)
## [1] "ID_ANNEX" "ANNEXDESCR" "ID_TEMA" "TEMA_DESCR" "ID_CONJUNT"
## [6] "CONJ_DESCR" "ID_SUBCONJ" "SCONJ_DESC" "ID_ELEMENT" "ELEM_DESCR"
## [11] "NIVELL" "NDESCR_CA" "NDESCR_ES" "NDESCR_EN" "TERME"
## [16] "DISTRICTE" "BARRI" "AEB" "SEC_CENS" "GRANBARRI"
## [21] "ZUA" "AREA_I" "LITERAL" "PERIMETRE" "AREA"
## [26] "ORD_REPRES" "CODI_UA" "TIPUS_UA" "NOM" "WEB1"
## [31] "WEB2" "WEB3" "DOCUMENTA" "RANGESCALA" "TIPUS_POL"
## [36] "GRUIX_ID" "GRUIXDIMEN" "ESTIL_ID" "ESTIL_QGIS" "VALOR1QGIS"
## [41] "VALOR2QGIS" "COL_FARCIT" "FCOL_DESCR" "FHEX_COLOR" "COL_DESCR"
## [46] "HEX_COLOR7" "geometry"
head(distritos_sf)
## Simple feature collection with 6 features and 46 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 420812.5 ymin: 4574282 xmax: 433088.3 ymax: 4587641
## Projected CRS: ETRS89 / UTM zone 31N
## ID_ANNEX ANNEXDESCR ID_TEMA TEMA_DESCR ID_CONJUNT CONJ_DESCR
## 1 01 Grup - I 0104 Unitats Administratives 010412 Districtes
## 2 01 Grup - I 0104 Unitats Administratives 010412 Districtes
## 3 01 Grup - I 0104 Unitats Administratives 010412 Districtes
## 4 01 Grup - I 0104 Unitats Administratives 010412 Districtes
## 5 01 Grup - I 0104 Unitats Administratives 010412 Districtes
## 6 01 Grup - I 0104 Unitats Administratives 010412 Districtes
## ID_SUBCONJ SCONJ_DESC ID_ELEMENT ELEM_DESCR NIVELL
## 1 01041201 Districte 0104120101 Límit de districte ADM_02_PL
## 2 01041201 Districte 0104120101 Límit de districte ADM_02_PL
## 3 01041201 Districte 0104120101 Límit de districte ADM_02_PL
## 4 01041201 Districte 0104120101 Límit de districte ADM_02_PL
## 5 01041201 Districte 0104120101 Límit de districte ADM_02_PL
## 6 01041201 Districte 0104120101 Límit de districte ADM_02_PL
## NDESCR_CA NDESCR_ES
## 1 Límit de districte (polígon) Límite de distrito (polígono)
## 2 Límit de districte (polígon) Límite de distrito (polígono)
## 3 Límit de districte (polígon) Límite de distrito (polígono)
## 4 Límit de districte (polígon) Límite de distrito (polígono)
## 5 Límit de districte (polígon) Límite de distrito (polígono)
## 6 Límit de districte (polígon) Límite de distrito (polígono)
## NDESCR_EN TERME DISTRICTE BARRI AEB SEC_CENS GRANBARRI ZUA
## 1 District boundary (polygon) 080193 01 - - - - -
## 2 District boundary (polygon) 080193 02 - - - - -
## 3 District boundary (polygon) 080193 03 - - - - -
## 4 District boundary (polygon) 080193 04 - - - - -
## 5 District boundary (polygon) 080193 05 - - - - -
## 6 District boundary (polygon) 080193 06 - - - - -
## AREA_I LITERAL PERIMETRE AREA ORD_REPRES CODI_UA TIPUS_UA
## 1 - 01 21366.96 4204931 5 01 DISTRICTE
## 2 - 02 13931.64 7464303 5 02 DISTRICTE
## 3 - 03 46711.86 22879850 5 03 DISTRICTE
## 4 - 04 12551.60 6010769 5 04 DISTRICTE
## 5 - 05 35658.34 19910635 5 05 DISTRICTE
## 6 - 06 12482.17 4224278 5 06 DISTRICTE
## NOM WEB1
## 1 Ciutat Vella http://www.bcn.cat/ciutatvella
## 2 Eixample http://www.bcn.cat/eixample
## 3 Sants-Montjuïc http://www.bcn.cat/sants-montjuic
## 4 Les Corts http://www.bcn.cat/lescorts
## 5 Sarrià-Sant Gervasi http://www.bcn.cat/sarria-santgervasi
## 6 Gràcia http://www.bcn.cat/gracia
## WEB2
## 1 http://www.bcn.cat/estadistica/catala/dades/guiadt01/index.htm
## 2 http://www.bcn.cat/estadistica/catala/dades/guiadt02/index.htm
## 3 http://www.bcn.cat/estadistica/catala/dades/guiadt03/index.htm
## 4 http://www.bcn.cat/estadistica/catala/dades/guiadt04/index.htm
## 5 http://www.bcn.cat/estadistica/catala/dades/guiadt05/index.htm
## 6 http://www.bcn.cat/estadistica/catala/dades/guiadt06/index.htm
## WEB3
## 1 http://www.bcn.cat/estadistica/catala/documents/districtes/01_CiutatVella_2017.pdf
## 2 http://www.bcn.cat/estadistica/catala/documents/districtes/02_Eixample_2017.pdf
## 3 http://www.bcn.cat/estadistica/catala/documents/districtes/03_Sants_Montju%C3%AFc_2017.pdf
## 4 http://www.bcn.cat/estadistica/catala/documents/districtes/04_LesCorts_2017.pdf
## 5 http://www.bcn.cat/estadistica/catala/documents/districtes/05_Sarria_San%20Gervasi_2017.pdf
## 6 http://www.bcn.cat/estadistica/catala/documents/districtes/06_Gracia_2017.pdf
## DOCUMENTA RANGESCALA TIPUS_POL GRUIX_ID GRUIXDIMEN ESTIL_ID ESTIL_QGIS
## 1 <NA> 1-150000 <NA> 6 70 0 Sòlid
## 2 <NA> 1-150000 <NA> 6 70 0 Sòlid
## 3 <NA> 1-150000 <NA> 6 70 0 Sòlid
## 4 <NA> 1-150000 <NA> 6 70 0 Sòlid
## 5 <NA> 1-150000 <NA> 6 70 0 Sòlid
## 6 <NA> 1-150000 <NA> 6 70 0 Sòlid
## VALOR1QGIS VALOR2QGIS COL_FARCIT FCOL_DESCR FHEX_COLOR COL_DESCR HEX_COLOR7
## 1 0 0 1 Negre #000000 Negre #000000
## 2 0 0 1 Negre #000000 Negre #000000
## 3 0 0 1 Negre #000000 Negre #000000
## 4 0 0 1 Negre #000000 Negre #000000
## 5 0 0 1 Negre #000000 Negre #000000
## 6 0 0 1 Negre #000000 Negre #000000
## geometry
## 1 MULTIPOLYGON (((431733.7 45...
## 2 MULTIPOLYGON (((432033.2 45...
## 3 MULTIPOLYGON (((428773.9 45...
## 4 MULTIPOLYGON (((425054.7 45...
## 5 MULTIPOLYGON (((422515.6 45...
## 6 MULTIPOLYGON (((427827 4586...
Se crea un gráfico de calor en el que vemos que distritos son los más caros y los más baratos, para apartamentos enteros.
library(dplyr)
library(sf)
library(ggplot2)
library(viridis)
## Warning: package 'viridis' was built under R version 4.4.3
## Cargando paquete requerido: viridisLite
## Warning: package 'viridisLite' was built under R version 4.4.3
##
## Adjuntando el paquete: 'viridis'
## The following object is masked from 'package:scales':
##
## viridis_pal
# 1. Renombrar solo si hace falta
if (!"distrito" %in% colnames(distritos_sf)) {
distritos_sf <- distritos_sf %>% rename(distrito = NOM)
}
# 2. Filtrar precios PLS solo para 'Entire home/apt'
# 2. Filtrar precios PLS solo para 'Entire home/apt'
precio_medio_distrito_pls <- precio_medio_distrito_tipo_pls %>%
filter(room_type == "Entire home/apt") %>%
dplyr::select(distrito, precio_medio_estimado_pls)
# 3. Join espacial
mapa_distritos_pls <- left_join(distritos_sf, precio_medio_distrito_pls, by = "distrito")
# 4. Comprobar que no haya errores
sum(is.na(mapa_distritos_pls$precio_medio_estimado_pls)) # Debería dar 0
## [1] 0
# 5. Visualización
ggplot(mapa_distritos_pls) +
geom_sf(aes(fill = precio_medio_estimado_pls), color = "white") +
scale_fill_viridis_c(option = "plasma") +
labs(
title = "Precio medio estimado (PLS) por noche en cada distrito de Barcelona",
subtitle = "Solo 'Entire home/apt'",
fill = "€/noche"
) +
theme_minimal()
Se crea un gráfico de calor en el que vemos que distritos son los más
caros y los más baratos, para apartamentos con habitaciones
privadas.
precio_medio_distrito_pls_private <- precio_medio_distrito_tipo_pls %>%
filter(room_type == "Private room") %>%
select(distrito, precio_medio_estimado_pls)
mapa_distritos_pls_private <- left_join(distritos_sf, precio_medio_distrito_pls_private, by = "distrito")
ggplot(mapa_distritos_pls_private) +
geom_sf(aes(fill = precio_medio_estimado_pls), color = "white") +
scale_fill_viridis_c(option = "plasma") +
labs(
title = "Precio medio estimado (PLS) por noche - Habitaciones privadas",
fill = "€/noche"
) +
theme_minimal()
Este código prepara los datos para clustering usando solo el precio
estimado PLS
datos_cluster_pls <- data_clean %>%
filter(!is.na(precio_estimado_pls), !is.na(longitude), !is.na(latitude)) %>%
select(precio_estimado_pls, longitude, latitude)
datos_scaled_pls <- scale(datos_cluster_pls$precio_estimado_pls)
Vemos el número de clusters ideal.
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.3
##
## Adjuntando el paquete: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
km_opt <- function(x, k) kmeans(x, centers = k, nstart = 5)
p1 <- fviz_nbclust(x = datos_scaled_pls, FUNcluster = km_opt, method = "silhouette", k.max = 12) +
labs(title = "K-means - Silhouette")
p2 <- fviz_nbclust(x = datos_scaled_pls, FUNcluster = km_opt, method = "wss", k.max = 12) +
labs(title = "K-means - WSS")
grid.arrange(p1, p2, nrow = 1)
Aplicamos k-medias con 8 clusters.
set.seed(123)
modelo_kmeans <- kmeans(datos_scaled_pls, centers = 8, nstart = 25)
datos_cluster_pls$cluster_kmeans <- as.factor(modelo_kmeans$cluster)
Se pinta un mapa con los clusters hechos anteriormente.
ggplot(datos_cluster_pls, aes(x = longitude, y = latitude, color = cluster_kmeans)) +
geom_point(alpha = 0.7, size = 1.5) +
coord_fixed() +
scale_color_brewer(palette = "Set1") +
labs(
title = "Zonas económicas por clustering K-means",
subtitle = "Clusters creados SOLO con el precio estimado (PLS)",
color = "Grupo de precio"
) +
theme_minimal()
Vemos una tabla con los cluster hecho, en la que se ve el número de
pisos, el precio medio, el precio máx y el mín.
library(dplyr)
datos_cluster_pls %>%
group_by(cluster_kmeans) %>%
summarise(
n_pisos = n(),
precio_medio = round(mean(precio_estimado_pls), 2),
precio_min = round(min(precio_estimado_pls), 2),
precio_max = round(max(precio_estimado_pls), 2)
) %>%
arrange(precio_medio)
## # A tibble: 8 × 5
## cluster_kmeans n_pisos precio_medio precio_min precio_max
## <fct> <int> <dbl> <dbl> <dbl>
## 1 7 8015 41.8 33.2 55.6
## 2 2 1991 70.2 56.1 83.1
## 3 3 3409 96.2 83.4 111.
## 4 5 2031 126. 111. 147.
## 5 1 676 167. 147. 196.
## 6 6 219 227. 198. 277.
## 7 8 46 331. 281. 413.
## 8 4 51 544. 442. 599
Unimos el ranking al dataset original y lo convertimos a factor para poder usarlo posteriormente en gráficos.
ranking_clusters_pls <- datos_cluster_pls %>%
group_by(cluster_kmeans) %>%
summarise(precio_medio = mean(precio_estimado_pls)) %>%
arrange(precio_medio) %>%
mutate(cluster_rank = row_number()) # 1 = más barato, N = más caro
datos_cluster_pls <- datos_cluster_pls %>%
left_join(ranking_clusters_pls, by = "cluster_kmeans") %>%
mutate(cluster_rank = factor(cluster_rank))
Realizamos el gráfico ya con el ranking en el dataset.
library(ggplot2)
ggplot(datos_cluster_pls, aes(x = longitude, y = latitude, color = cluster_rank)) +
geom_point(alpha = 0.6, size = 1.2) +
coord_fixed() +
scale_color_brewer(palette = "YlOrRd", direction = 1) +
labs(
title = "Zonas económicas de Airbnb en Barcelona",
subtitle = "Ranking de precio medio estimado (1 = más barato, N = más caro)",
color = "Nivel de precio"
) +
theme_minimal()
En este gráfico resaltamos los niveles 6,7 y 8 para poder detectar algún
tipo de tendencia.
library(ggplot2)
library(dplyr)
ggplot() +
# Fondo atenuado con todos los puntos
geom_point(data = datos_cluster_pls, aes(x = longitude, y = latitude),
color = "grey85", alpha = 0.15, size = 0.4) +
# Clusters de nivel 6 y 7
geom_point(
data = filter(datos_cluster_pls, cluster_rank %in% c("6", "7")),
aes(x = longitude, y = latitude, color = cluster_rank),
size = 1.4, alpha = 0.8
) +
# Cluster de nivel 8 (el más caro)
geom_point(
data = filter(datos_cluster_pls, cluster_rank == "8"),
aes(x = longitude, y = latitude),
color = "#a50026", # rojo oscuro
size = 2.3, alpha = 0.95
) +
scale_color_manual(
values = c("6" = "#fdae61", "7" = "#f46d43"),
labels = c("6" = "Nivel 6", "7" = "Nivel 7")
) +
coord_fixed() +
labs(
title = "Zonas más caras de Airbnb en Barcelona",
subtitle = "Nivel 8 (el más caro) resaltado con más intensidad",
color = "Nivel de precio"
) +
theme_minimal()
Ahora vamos a hacer clustering pero solo con los apartamentos en los que
se alquila una habitación, “private room”.Quitamos las filas con precios
muy muy altos que se cargan nuestro clustering y pueden ser posible
outliers.
data_private <- data_clean %>%
filter(room_type == "Private room",
!is.na(precio_estimado_pls),
!is.na(longitude), !is.na(latitude)) %>%
select(precio_estimado_pls, longitude, latitude)
data_private <- data_private %>%
filter(precio_estimado_pls < quantile(precio_estimado_pls, 0.997)) # quitar top 1%
Escalamos los datos.
datos_scaled_private <- scale(data_private$precio_estimado_pls)
Con el coeficiente de silhouette vemos el número óptimo de clusters.
library(factoextra)
library(gridExtra)
km_opt <- function(x, k) kmeans(x, centers = k, nstart = 5)
p1 <- fviz_nbclust(x = datos_scaled_private, FUNcluster = km_opt, method = "silhouette", k.max = 12)
p2 <- fviz_nbclust(x = datos_scaled_private, FUNcluster = km_opt, method = "wss", k.max = 12)
grid.arrange(p1, p2, nrow = 1)
Aplicamos k-medias con 8 clusters.
set.seed(123)
modelo_kmeans_private <- kmeans(datos_scaled_private, centers = 8, nstart = 25)
data_private$cluster_kmeans <- as.factor(modelo_kmeans_private$cluster)
Una pequeña tabla como la de antes que muestre los clusters que se han creado.
library(dplyr)
data_private %>%
group_by(cluster_kmeans) %>%
summarise(
n_pisos = n(),
precio_medio = round(mean(precio_estimado_pls), 2),
precio_min = round(min(precio_estimado_pls), 2),
precio_max = round(max(precio_estimado_pls), 2)
) %>%
arrange(precio_medio)
## # A tibble: 8 × 5
## cluster_kmeans n_pisos precio_medio precio_min precio_max
## <fct> <int> <dbl> <dbl> <dbl>
## 1 8 2171 37.0 33.2 39.7
## 2 6 4186 42.5 39.7 43.3
## 3 5 1040 44.7 43.9 46.5
## 4 3 460 49.0 47.0 50.3
## 5 7 151 52.4 51.0 54.6
## 6 1 328 57.4 55.6 60.5
## 7 4 103 64.4 61.3 71.2
## 8 2 40 79.6 73.9 91.4
Agrupamos los pisos por clusters y con la media de su precios realizamos un ranking.
ranking_clusters_private <- data_private %>%
group_by(cluster_kmeans) %>%
summarise(precio_medio = mean(precio_estimado_pls)) %>%
arrange(precio_medio) %>%
mutate(cluster_rank = row_number())
data_private <- data_private %>%
left_join(ranking_clusters_private, by = "cluster_kmeans") %>%
mutate(cluster_rank = factor(cluster_rank))
Ralizamos el gráfico por clusters ordenados por precios, atendiendo solo a “private room”.
library(ggplot2)
ggplot(data_private, aes(x = longitude, y = latitude, color = cluster_rank)) +
geom_point(alpha = 0.6, size = 1.2) +
coord_fixed() +
scale_color_brewer(palette = "YlOrRd", direction = 1) +
labs(
title = "Zonas económicas de habitaciones privadas en Barcelona",
subtitle = "Ranking de precio estimado (1 = más barato, N = más caro)",
color = "Nivel de precio"
) +
theme_minimal()
Aquí,como antes resaltamos los cluster con mayores precios, que son el
6,7 y 8.
ggplot() +
geom_point(data = data_private, aes(x = longitude, y = latitude),
color = "grey85", alpha = 0.15, size = 0.4) +
geom_point(
data = filter(data_private, cluster_rank %in% c("6", "7")),
aes(x = longitude, y = latitude, color = cluster_rank),
size = 1.4, alpha = 0.8
) +
geom_point(
data = filter(data_private, cluster_rank == "8"),
aes(x = longitude, y = latitude),
color = "#a50026", size = 2.3, alpha = 0.95
) +
scale_color_manual(
values = c("6" = "#fdae61", "7" = "#f46d43"),
labels = c("6" = "Nivel 6", "7" = "Nivel 7")
) +
coord_fixed() +
labs(
title = "Zonas más caras de habitaciones privadas en Barcelona",
subtitle = "Nivel 8 (el más caro) resaltado con más intensidad",
color = "Nivel de precio"
) +
theme_minimal()
Este código selecciona los pisos más caros según el modelo de precios estimados y utiliza sus coordenadas para obtener sus direcciones reales con OpenStreetMap. Luego agrupa esos pisos por código postal y genera un ranking con los que tienen más presencia. A partir de ahí, se enlazan esas direcciones con los precios originales, y se crean mapas interactivos para los 9 códigos postales más frecuentes. En cada mapa se visualiza la ubicación exacta de los pisos caros junto con su precio estimado, permitiendo identificar visualmente en qué zonas concretas de la ciudad se concentran los alojamientos más exclusivos.
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.4.3
library(tidygeocoder)
## Warning: package 'tidygeocoder' was built under R version 4.4.3
pisos_caros <- datos_cluster_pls %>%
filter(cluster_rank == "8")
# Asegúrate de que las columnas se llaman lat y long
coordenadas <- pisos_caros %>%
select(latitude, longitude) %>%
rename(lat = latitude, long = longitude) %>%
head(10)
# 2. Seleccionar y renombrar columnas necesarias para la geocodificación
coordenadas <- pisos_caros %>%
select(latitude, longitude) %>%
rename(lat = latitude, long = longitude)
# 3. Geocodificación inversa con OpenStreetMap
calles <- reverse_geocode(
.tbl = coordenadas,
lat = lat,
long = long,
method = "osm",
full_results = TRUE
)
## Passing 51 coordinates to the Nominatim single coordinate geocoder
## Query completed in: 51.9 seconds
names(calles)
## [1] "lat" "long" "address" "place_id"
## [5] "licence" "osm_type" "osm_id" "osm_lat"
## [9] "osm_lon" "class" "type" "place_rank"
## [13] "importance" "addresstype" "name" "road"
## [17] "quarter" "suburb" "city" "county"
## [21] "province" "ISO3166-2-lvl6" "state" "ISO3166-2-lvl4"
## [25] "postcode" "country" "country_code" "boundingbox"
## [29] "neighbourhood" "house_number" "shop" "amenity"
## [33] "tourism" "office"
# 4. Enlazar direcciones con precios
calles_enriquecido <- calles %>%
left_join(pisos_caros, by = c("lat" = "latitude", "long" = "longitude"))
# 5. Comprobación de NAs en precio
cat("NAs tras el join:", sum(is.na(calles_enriquecido$precio_estimado_pls)), "\n")
## NAs tras el join: 0
# 6. Ranking de códigos postales
ranking_postcodes <- calles_enriquecido %>%
filter(!is.na(postcode)) %>%
group_by(postcode) %>%
summarise(n_pisos = n()) %>%
arrange(desc(n_pisos))
# 7. Extraer los más frecuentes
top_postcodes <- ranking_postcodes$postcode[1:9]
# 8. Crear mapas solo si hay datos válidos
for (i in 1:length(top_postcodes)) {
postcode_actual <- top_postcodes[i]
pisos_postcode <- calles_enriquecido %>%
filter(postcode == postcode_actual) %>%
filter(!is.na(lat), !is.na(long)) # Extra check
if (nrow(pisos_postcode) == 0) {
cat("⚠️ No hay datos para postcode:", postcode_actual, "\n")
next
}
cat("📍 Mostrando mapa para postcode:", postcode_actual, "\n")
leaflet(data = pisos_postcode) %>%
addTiles() %>%
addCircleMarkers(
lng = ~long, lat = ~lat,
popup = ~paste0(
"<b>Precio estimado (PLS): </b>", round(precio_estimado_pls, 2), " €<br>",
"<b>Postcode: </b>", postcode
),
radius = 6, color = "darkred", fillOpacity = 0.85
) %>%
addLegend(
position = "bottomright",
colors = "darkred",
labels = paste("Postcode", postcode_actual)
) %>%
setView(lng = mean(pisos_postcode$long), lat = mean(pisos_postcode$lat), zoom = 14) %>%
print()
}
## 📍 Mostrando mapa para postcode: 08010
## 📍 Mostrando mapa para postcode: 08013
## 📍 Mostrando mapa para postcode: 08037
## 📍 Mostrando mapa para postcode: 08007
## 📍 Mostrando mapa para postcode: 08025
## 📍 Mostrando mapa para postcode: 08009
## 📍 Mostrando mapa para postcode: 08014
## 📍 Mostrando mapa para postcode: 08001
## 📍 Mostrando mapa para postcode: 08011
Este código selecciona únicamente las habitaciones privadas más caras de Barcelona (las que pertenecen al cluster más alto según su precio estimado). A partir de sus coordenadas, se consulta su dirección real utilizando OpenStreetMap. Luego se agrupan por código postal y se identifican las zonas donde más se repiten estos pisos caros. Una vez hecho esto, se enlazan de nuevo con los precios originales y se crean mapas interactivos donde se puede ver, para cada uno de los 9 códigos postales más frecuentes, la ubicación exacta de cada habitación cara junto con su precio estimado. Esto permite visualizar con claridad dónde se concentran las zonas más exclusivas dentro de las Private room.
# 1. Filtrar los pisos más caros (cluster_rank 8) solo para Private room
pisos_caros_private <- data_private %>%
filter(cluster_rank == "8")
# 2. Renombrar lat/lon
coordenadas <- pisos_caros_private %>%
select(latitude, longitude) %>%
rename(lat = latitude, long = longitude)
# 3. Geocodificación inversa
calles_private <- reverse_geocode(
.tbl = coordenadas,
lat = lat,
long = long,
method = "osm",
full_results = TRUE
)
## Passing 40 coordinates to the Nominatim single coordinate geocoder
## Query completed in: 40.6 seconds
# 4. Unir coordenadas con precios originales
calles_private_enriquecido <- calles_private %>%
left_join(pisos_caros_private, by = c("lat" = "latitude", "long" = "longitude"))
# 5. Ranking de postcodes con más pisos
ranking_postcodes_private <- calles_private_enriquecido %>%
filter(!is.na(postcode)) %>%
group_by(postcode) %>%
summarise(n_pisos = n()) %>%
arrange(desc(n_pisos))
top_postcodes <- ranking_postcodes_private$postcode[1:9]
# 6. Visualizar mapas por cada postcode
for (i in 1:length(top_postcodes)) {
postcode_actual <- top_postcodes[i]
pisos_postcode <- calles_private_enriquecido %>%
filter(postcode == postcode_actual) %>%
filter(!is.na(lat), !is.na(long))
if (nrow(pisos_postcode) == 0) {
cat("⚠️ No hay datos para postcode:", postcode_actual, "\n")
next
}
cat("📍 Mostrando mapa para postcode:", postcode_actual, "\n")
leaflet(data = pisos_postcode) %>%
addTiles() %>%
addCircleMarkers(
lng = ~long, lat = ~lat,
popup = ~paste0(
"<b>Precio estimado (PLS): </b>", round(precio_estimado_pls, 2), " €<br>",
"<b>Postcode: </b>", postcode
),
radius = 6, color = "darkred", fillOpacity = 0.85
) %>%
addLegend(
position = "bottomright",
colors = "darkred",
labels = paste("Postcode", postcode_actual)
) %>%
setView(lng = mean(pisos_postcode$long), lat = mean(pisos_postcode$lat), zoom = 14) %>%
print()
}
## 📍 Mostrando mapa para postcode: 08002
## 📍 Mostrando mapa para postcode: 08015
## 📍 Mostrando mapa para postcode: 08011
## 📍 Mostrando mapa para postcode: 08012
## 📍 Mostrando mapa para postcode: 08001
## 📍 Mostrando mapa para postcode: 08007
## 📍 Mostrando mapa para postcode: 08014
## 📍 Mostrando mapa para postcode: 08029
## 📍 Mostrando mapa para postcode: 08003
Objetivo 2. Vamos a estudiar el precio relativo de las viviendas segun las varibles como ascensor, garaje … y ver si estas tienen relacion con el precio y por tanto explican bien el modelo
A patir de la variable amenities, en la que se encuantran distintas características que tiene el airbnb, sacamos variables relacionadas con la finca en la que se encuentra el airbnb, mediante cadenas de texto que si aparecen en amenities, el valor en la nueva variable será TRUE y si no FALSE.
Posteriormente las normalizamos por si fuese necesario a la hora de realizar el modelo.
Primero extraemos si en la cadena de caracteres de amenities aparece la palabra elevator o ascensor, para ver si la finca del airbnb tiene o no asensor
data_clean$ascensor <- grepl("elevator|ascensor", data_clean$amenities, ignore.case = TRUE)
data_clean$ascensor_normalizado <- ifelse(data_clean$ascensor == "TRUE", 1, 0)
ggplot(data_clean, aes(x = "", fill = as.factor(ascensor))) +
geom_bar(width = 1, stat = "count", color = "black") +
coord_polar(theta = "y") +
labs(title = "Distribución de apartamentos con y sin ascensor",
x = NULL, y = NULL) +
theme_void() +
theme(legend.title = element_blank()) +
geom_text(aes(label = scales::percent(..count../sum(..count..))),
stat = "count", position = position_stack(vjust = 0.5))
df_media_asc <- data_clean %>%
group_by(distrito, ascensor_normalizado)%>%
summarise(precio_medio = mean(price, na.rm = TRUE))
## `summarise()` has grouped output by 'distrito'. You can override using the
## `.groups` argument.
ggplot(df_media_asc, aes(x = distrito, y = precio_medio, fill = factor(ascensor_normalizado))) +
geom_col(position = "dodge") +
labs(title = "Precio medio por distrito según la variable ascensor",
x = "Distrito", y = "Precio medio", fill = "Ascensor") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
En segundo lugar hacemos lo mismo que en ascensor pero para aparcamiento, en amenities salen parking tanto externos a la fninca como internos, por tanto vamos solo a determinar si la finca tiene o no parking propio.
data_clean$aparcamiento <- grepl("paid parking off premises|free parking off premises|street parking", data_clean$amenities, ignore.case = TRUE)
data_clean$tiene_parking <- ifelse(
data_clean$aparcamiento == FALSE & grepl("parking", data_clean$amenities, ignore.case = TRUE),
TRUE,
FALSE
)
data_clean$garaje_propio_normalizado <- ifelse(data_clean$tiene_parking == "TRUE", 1, 0)
ggplot(data_clean, aes(x = "", fill = as.factor(tiene_parking))) +
geom_bar(width = 1, stat = "count", color = "black") +
coord_polar(theta = "y") +
labs(title = "Distribución de apartamentos con aparcamiento y sin",
x = NULL, y = NULL) +
theme_void() +
theme(legend.title = element_blank()) +
geom_text(aes(label = scales::percent(..count../sum(..count..))),
stat = "count", position = position_stack(vjust = 0.5))
df_media_pkn <- data_clean %>%
group_by(distrito, garaje_propio_normalizado)%>%
summarise(precio_medio = mean(price, na.rm = TRUE))
## `summarise()` has grouped output by 'distrito'. You can override using the
## `.groups` argument.
ggplot(df_media_pkn, aes(x = distrito, y = precio_medio, fill = factor(garaje_propio_normalizado))) +
geom_col(position = "dodge") +
labs(title = "Precio medio por distrito según la variable parking",
x = "Distrito", y = "Precio medio", fill = "Parking") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Por último, vamos ha realizar el mismo procedimiento pero con caracteristiscas no tan comunes como son si la finca tiene piscina o gym.
data_clean$adicionales <- grepl("gym|pool", data_clean$amenities, ignore.case = TRUE)
data_clean$adicionales_normalizado <- ifelse(data_clean$adicionales == "TRUE", 1, 0)
ggplot(data_clean, aes(x = "", fill = as.factor(adicionales))) +
geom_bar(width = 1, stat = "count", color = "black") +
coord_polar(theta = "y") +
labs(title = "Distribución de apartamentos con o sin piscina o gimnasio",
x = NULL, y = NULL) +
theme_void() +
theme(legend.title = element_blank()) +
geom_text(aes(label = scales::percent(..count../sum(..count..))),
stat = "count", position = position_stack(vjust = 0.5))
df_media_adi <- data_clean %>%
group_by(distrito, adicionales_normalizado)%>%
summarise(precio_medio = mean(price, na.rm = TRUE))
## `summarise()` has grouped output by 'distrito'. You can override using the
## `.groups` argument.
ggplot(df_media_adi, aes(x = distrito, y = precio_medio, fill = factor(adicionales_normalizado))) +
geom_col(position = "dodge") +
labs(title = "Precio medio por distrito según las variables adicionales",
x = "Distrito", y = "Precio medio", fill = "Adicionales") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(dplyr)
tabla_relacion <- data_clean %>%
group_by(distrito) %>%
summarise(
n = n(),
precio_medio = round(mean(price, na.rm = TRUE), 2),
distancia_media = round(mean(Las_Ramblas, na.rm = TRUE), 2)
)
coef <- max(tabla_relacion$precio_medio) / max(tabla_relacion$distancia_media)
ggplot(tabla_relacion, aes(x = reorder(distrito, precio_medio))) +
geom_col(aes(y = precio_medio), fill = "steelblue") +
geom_line(aes(y = distancia_media * coef, group = 1), color = "firebrick", size = 1.2) +
geom_point(aes(y = distancia_media * coef), color = "firebrick", size = 2) +
scale_y_continuous(
name = "Precio medio",
sec.axis = sec_axis(~ . / coef, name = "Distancia media al centro")
) +
labs(title = "Precio medio (barras) y distancia media al centro (línea) por distrito",
x = "Distrito") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
MODELO OBJETIVO 2
En este modelo completamos el modelo del objetivo 1, añadiendo las variables pertenecientes a la finca extraidas anteriormente.
modelo_log2 <- lm(log(price) ~ ascensor + tiene_parking + adicionales + Las_Ramblas +
n_amenities + bedrooms + bathrooms + accommodates + room_type_normalizado, data = data_clean)
summary(modelo_log2)
##
## Call:
## lm(formula = log(price) ~ ascensor + tiene_parking + adicionales +
## Las_Ramblas + n_amenities + bedrooms + bathrooms + accommodates +
## room_type_normalizado, data = data_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4122 -0.3298 -0.0201 0.3053 2.6969
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.493896 0.014288 244.528 < 2e-16 ***
## ascensorTRUE 0.083711 0.008269 10.123 < 2e-16 ***
## tiene_parkingTRUE 0.008890 0.012959 0.686 0.49271
## adicionalesTRUE 0.098964 0.026515 3.732 0.00019 ***
## Las_Ramblas -0.071562 0.003215 -22.262 < 2e-16 ***
## n_amenities 0.001586 0.000381 4.164 3.14e-05 ***
## bedrooms -0.011510 0.007232 -1.591 0.11153
## bathrooms 0.043620 0.008941 4.879 1.08e-06 ***
## accommodates 0.147798 0.003681 40.147 < 2e-16 ***
## room_type_normalizado 0.457597 0.010996 41.613 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5017 on 16428 degrees of freedom
## Multiple R-squared: 0.5243, Adjusted R-squared: 0.5241
## F-statistic: 2012 on 9 and 16428 DF, p-value: < 2.2e-16
library(ggplot2)
library(broom)
## Warning: package 'broom' was built under R version 4.4.3
library(dplyr)
coeficientes <- tidy(modelo_log2) %>%
filter(term != "(Intercept)")
ggplot(coeficientes, aes(x = reorder(term, estimate), y = estimate)) +
geom_col(fill = "skyblue") +
coord_flip() +
labs(title = "Direct Effect of Variables on log(Price)",
x = "Variables",
y = "Estimated Coefficient") +
theme_minimal()
data_clean$log_precio_estimado_finca <- predict(modelo_log2)
data_clean$precio_relativo_estimado_finca <- round(exp(data_clean$log_precio_estimado_finca), 2)
precio_medio_distrito_finca <- data_clean %>%
group_by(distrito) %>%
summarise(
n_pisos = n(),
precio_medio_estimado = round(mean(precio_relativo_estimado_finca, na.rm = TRUE), 2),
precio_medio_real = round(mean(price, na.rm = TRUE), 2)
) %>%
arrange(desc(precio_medio_estimado))
# Mostrar tabla
library(knitr)
kable(precio_medio_distrito_finca, caption = "Precio medio estimado por noche en cada distrito")
| distrito | n_pisos | precio_medio_estimado | precio_medio_real |
|---|---|---|---|
| Eixample | 5513 | 92.82 | 107.96 |
| Sarrià-Sant Gervasi | 631 | 79.47 | 96.39 |
| Gràcia | 1370 | 73.56 | 84.01 |
| Sants-Montjuïc | 1905 | 72.70 | 79.35 |
| Ciutat Vella | 3975 | 71.74 | 77.17 |
| Sant Martí | 1759 | 66.66 | 80.98 |
| Les Corts | 305 | 64.62 | 70.22 |
| Horta-Guinardó | 510 | 54.19 | 62.87 |
| Sant Andreu | 270 | 46.38 | 46.94 |
| Nou Barris | 200 | 37.15 | 44.94 |
Una vez vistos los resultados del modelo logarítmico y calculado de nuevo el precio relativo esta vez incluyendo las varibles de la finca, vemos que, en cuanto al modelo las variables nuevas que más efecto significativo tienen sobre el porecio son tener o no ascensor en la finca, en el que cuando hay ascensor en la finca el precio se ve incrementado en un 8,5% aproximadamente, y la distancia al centro de la ciudad de barcelona (Las_Ramblas), esta tiene relacion inversa, cuando mayor es la distancia menor es el porecio, el precio disminuye un 7% aproxidamente. En cuanto al resto de variables, las que hemos mantenido del modelo anterior siguen el mismo patron en cuanto a efecto, acommodates sigue siendo la más importante. Por último si nos fijamos en la comparación de los precios vemos que hay distritos en los que el precio relatiuvo ya esta bastante cerca del real, como en SAnt Andreu, lo que nos dice que hay zonas de barcelona donde estas variables son las que más efecto tienen a la hora de establecer el precio, pero en otros distritos aun no termina de estar completamente explicado, por lo que hay que seguir estudiando más variables.
Modelo PLS Objetivo 2
Ahora vamo a realizar otro tipo de modelo para comparar los resultados de ambos y corroborar el efecto de las variables
vars <- c("ascensor_normalizado", "garaje_propio_normalizado", "adicionales_normalizado", "Las_Ramblas",
"bedrooms", "bathrooms", "accommodates", "room_type_normalizado", "n_amenities","price")
data_pls <- data_clean %>%
select(all_of(vars)) %>%
filter(if_all(everything(), ~ !is.na(.)))
X <- data_pls %>% select(-price)
Y <-data_pls$price
modelo_pls_lineal <- plsr(Y ~ ., data = X, scale = TRUE, validation = "CV")
summary(modelo_pls_lineal)
## Data: X dimension: 16438 9
## Y dimension: 16438 1
## Fit method: kernelpls
## Number of components considered: 9
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 78.11 60.49 60.05 59.85 59.73 59.73 59.72
## adjCV 78.11 60.49 60.05 59.85 59.73 59.72 59.72
## 7 comps 8 comps 9 comps
## CV 59.72 59.72 59.72
## adjCV 59.72 59.72 59.72
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 30.83 43.35 50.73 55.58 63.56 69.65 79.21 89.09
## Y 40.06 40.96 41.37 41.60 41.61 41.61 41.61 41.61
## 9 comps
## X 100.00
## Y 41.61
plot(RMSEP(modelo_pls_lineal),
main = "Error de predicción (RMSEP) vs número de componentes",
legendpos = "topright")
loadings_df_1 <- as.data.frame(loadings(modelo_pls_lineal)[, 1])
loadings_df_1 $variable <- rownames(loadings_df_1)
colnames(loadings_df_1)[1] <- "Comp1"
ggplot(loadings_df_1, aes(x = reorder(variable, Comp1), y = Comp1)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(
title = "Importancia de las variables en el primer componente PLS",
x = "Variable",
y = "Carga (Comp 1)"
) +
theme_minimal()
loadings(modelo_pls_lineal)
##
## Loadings:
## Comp 1 Comp 2 Comp 3 Comp 4 Comp 5 Comp 6 Comp 7
## ascensor_normalizado 0.114 -0.168 0.835 -0.685 0.443 -0.307 0.386
## garaje_propio_normalizado -0.536 0.259 0.137 -0.685 0.440 0.183
## adicionales_normalizado -0.326 0.587 -0.411 0.130 -0.263
## Las_Ramblas -0.527 0.277 -0.258 0.259
## bedrooms 0.530 -0.415 -0.294 0.328 0.195
## bathrooms 0.354 -0.167 -0.317 0.376 0.154 -0.694 0.428
## accommodates 0.558 0.116 0.173 0.341 0.306
## room_type_normalizado 0.463 0.213 0.243 -0.187 -0.548 -0.133 -0.362
## n_amenities 0.207 -0.645 0.132 0.388 -0.161 -0.573
## Comp 8 Comp 9
## ascensor_normalizado -0.250 0.314
## garaje_propio_normalizado 0.212 0.492
## adicionales_normalizado 0.693 -0.416
## Las_Ramblas -0.455 -0.636
## bedrooms -0.130
## bathrooms 0.329
## accommodates -0.100
## room_type_normalizado -0.181
## n_amenities -0.240 0.228
##
## Comp 1 Comp 2 Comp 3 Comp 4 Comp 5 Comp 6 Comp 7 Comp 8 Comp 9
## SS loadings 1.008 1.206 1.547 1.062 1.234 1.017 1.001 1.000 1.000
## Proportion Var 0.112 0.134 0.172 0.118 0.137 0.113 0.111 0.111 0.111
## Cumulative Var 0.112 0.246 0.418 0.536 0.673 0.786 0.897 1.008 1.119
data_clean$pls_precio_estimado_finca <- predict(modelo_pls_lineal, ncomp = 1)[, , 1]
data_clean$precio_estimado_pls_finca <- round(exp(data_clean$pls_precio_estimado_finca), 2)
data_clean$precio_estimado_pls_finca <- pmin(data_clean$precio_estimado_pls_finca, max(data_clean$price, na.rm = TRUE))
library(dplyr)
library(knitr)
pls_precio_medio_distrito <- data_clean %>%
group_by(distrito) %>%
summarise(
n_pisos = n(),
precio_medio_estimado_pls = round(mean(pls_precio_estimado_finca, na.rm = TRUE), 2),
precio_medio_real = round(mean(price, na.rm = TRUE), 2),
diferencia_absoluta = round(abs(precio_medio_estimado_pls - precio_medio_real), 2)
) %>%
arrange(desc(precio_medio_estimado_pls))
# 5. Mostrar tabla
kable(pls_precio_medio_distrito,
caption = "Precio medio estimado por noche (PLS) en cada distrito y tipo de habitación, con diferencia absoluta respecto al precio real")
| distrito | n_pisos | precio_medio_estimado_pls | precio_medio_real | diferencia_absoluta |
|---|---|---|---|---|
| Eixample | 5513 | 103.50 | 107.96 | 4.46 |
| Sarrià-Sant Gervasi | 631 | 97.86 | 96.39 | 1.47 |
| Gràcia | 1370 | 85.69 | 84.01 | 1.68 |
| Sants-Montjuïc | 1905 | 83.44 | 79.35 | 4.09 |
| Les Corts | 305 | 83.33 | 70.22 | 13.11 |
| Sant Martí | 1759 | 80.83 | 80.98 | 0.15 |
| Ciutat Vella | 3975 | 78.82 | 77.17 | 1.65 |
| Horta-Guinardó | 510 | 64.23 | 62.87 | 1.36 |
| Sant Andreu | 270 | 56.36 | 46.94 | 9.42 |
| Nou Barris | 200 | 45.30 | 44.94 | 0.36 |
if (!"distrito" %in% colnames(distritos_sf)) {
distritos_sf <- distritos_sf %>% rename(distrito = NOM)
}
mapa_distritos_pls_finca2 <- left_join(distritos_sf,pls_precio_medio_distrito , by = "distrito")
ggplot(mapa_distritos_pls_finca2) +
geom_sf(aes(fill = precio_medio_estimado_pls), color = "white") +
scale_fill_viridis_c(option = "plasma") +
labs(
title = "Precio estimado (PLS) por noche en los distrito de Barcelona",
fill = "€/noche"
) +
theme_minimal()
Para finalizar este objetivo vamo a ver que porcentaje afecta cada una de las variables en la variable precio.
r2_total <- summary(modelo_log2)$r.squared
vars <- c("ascensor", "tiene_parking", "adicionales", "Las_Ramblas",
"bedrooms", "bathrooms", "accommodates", "room_type_normalizado", "n_amenities" )
r2_sin_var <- sapply(vars, function(v) {
formula <- as.formula(paste("log(price) ~", paste(setdiff(vars, v), collapse = " + ")))
modelo_reducido <- lm(formula, data = data_clean)
summary(modelo_reducido)$r.squared
})
contribucion <- r2_total - r2_sin_var
porcentaje <- 100 * contribucion / sum(contribucion)
sort(round(porcentaje, 2), decreasing = TRUE)
## room_type_normalizado accommodates Las_Ramblas
## 43.30 40.30 12.39
## ascensor bathrooms n_amenities
## 2.56 0.60 0.43
## adicionales bedrooms tiene_parking
## 0.35 0.06 0.01
names(data_clean)
## [1] "id" "listing_url"
## [3] "name" "summary"
## [5] "space" "description"
## [7] "neighborhood_overview" "picture_url"
## [9] "host_id" "host_url"
## [11] "host_name" "host_since"
## [13] "host_response_time" "host_response_rate"
## [15] "host_is_superhost" "host_picture_url"
## [17] "host_neighbourhood" "host_listings_count"
## [19] "host_verifications" "host_has_profile_pic"
## [21] "host_identity_verified" "street"
## [23] "neighbourhood" "neighbourhood_cleansed"
## [25] "neighbourhood_group_cleansed" "city"
## [27] "zipcode" "country"
## [29] "latitude" "longitude"
## [31] "is_location_exact" "property_type"
## [33] "room_type" "accommodates"
## [35] "bathrooms" "bedrooms"
## [37] "beds" "amenities"
## [39] "price" "cleaning_fee"
## [41] "minimum_nights" "maximum_nights"
## [43] "has_availability" "availability_30"
## [45] "availability_60" "availability_90"
## [47] "availability_365" "number_of_reviews"
## [49] "number_of_reviews_ltm" "first_review"
## [51] "last_review" "review_scores_rating"
## [53] "review_scores_accuracy" "review_scores_cleanliness"
## [55] "review_scores_checkin" "review_scores_communication"
## [57] "review_scores_location" "review_scores_value"
## [59] "instant_bookable" "reviews_per_month"
## [61] "distrito" "criminalidad_distrito"
## [63] "city_clean" "num_paradas_bus"
## [65] "num_paradas_transport" "total_transporte_publico"
## [67] "total_centros_salud_distrito" "total_parques_jardines_distrito"
## [69] "Sagrada_Familia" "Las_Ramblas"
## [71] "Barceloneta" "Casa_Batllo"
## [73] "room_type_normalizado" "n_amenities"
## [75] "log_precio_estimado_pls" "precio_estimado_pls"
## [77] "ascensor" "ascensor_normalizado"
## [79] "aparcamiento" "tiene_parking"
## [81] "garaje_propio_normalizado" "adicionales"
## [83] "adicionales_normalizado" "log_precio_estimado_finca"
## [85] "precio_relativo_estimado_finca" "pls_precio_estimado_finca"
## [87] "precio_estimado_pls_finca"
respuesta <- data_clean$log_precio_estimado
head(data_clean)
## id listing_url
## 1 18666 https://www.airbnb.com/rooms/18666
## 2 18674 https://www.airbnb.com/rooms/18674
## 3 21605 https://www.airbnb.com/rooms/21605
## 4 25786 https://www.airbnb.com/rooms/25786
## 5 31377 https://www.airbnb.com/rooms/31377
## 6 31380 https://www.airbnb.com/rooms/31380
## name
## 1 Flat with Sunny Terrace
## 2 Huge flat for 8 people close to Sagrada Familia
## 3 Nice and sunny duble room
## 4 NICE ROOM AVAILABLE IN THE HEART OF GRACIA
## 5 Room for 2, Sagrada Famili
## 6 Room for 2-3. Barcelona, Sagrada Famili
## summary
## 1 Apartment located near the "Plaza de las Glorias" and the second-hand market (Encants). The accommodation is also close to the National Theatre of Catalunya and the Agbar Tower which has become one of the new symbols of Barcelona. Licence number: HUTB-(PHONE NUMBER HIDDEN)
## 2 110m2 apartment to rent in Barcelona. Located in the Eixample district, near the Sagrada Familia. It has a small balcony where you can see the temple of Gaudi. Capacity for 8 people. Licence number: HUTB-002062
## 3 The flat is in Poblenou district, and the room is a double room with a double bed, a wardrobe, a table, TV, wifi, heating and wood floor. Beautiful and charmy.
## 4 JUST GO THROUGH THE MANY REVIEWS I GOT THROUGH THE YEARS, NO BETTER FEEDBACK THAN THAT. WELCOME.
## 5 The room in 500 m from Sagrada Familia. 3 branches of the subway in 7 minutes of walking. A straight line to the center and the beach. A safe area area with the developed infrastructure. Completely equipped kitchen, a washing machine, an air conditioner. Private bathroom. Wi-fi free.
## 6 Room for 2-3. Great location, right next to Sagrada Familia, the symbol of Barcelona. Great infrastructure, a shopping area nearby. Next to two metro stations: Hospital de Sant Pau & Alfonso X. The room is in great condition, newly furnished. Laundry place and kitchen at your disposition, air conditioner.. Wi-fi.free.
## space
## 1 Nice apartment situated on the penthouse floor of a building with elevator. Huge Living/dining-room with double sofa-bed 1 bedroom with two single beds 1 bedroom with double bed Nice kitchen opened to the living/dining-room and fully equipped for 6 people Bathroom with shower The accommodation has been recently renovated and tastefully decorated with a comfortable furniture and wood floor. Also it is equipped with air-conditioning and heating.
## 2 Apartment with 110 m2 located in the 6th floor in a building with elevator Huge living/dinig-room 1 double bedrrom 1 bedroom with 2 single beds 1 bedroom with bunk beds Kitchen fully equipped for 8 people 1 bathroom with bathtub 1 small bathroom with shower balcony The accommodation has been recently renovated and tastefully decorated with a comfortable furniture and wood floor. Also it is equipped with heating, air conditioning and wifi.
## 3 L'apartament està al barri de Poblenou, i l'habitació te un llit doble de (Phone number hidden by Airbnb) armari, una taula i cadira d'estudi, TV, wifi, calefacció i terra de parquet. Bonica i encantadora. A prop de la platja, Glòries, Sagrada Família, museu del disseny, els encants, rambla del Poblenou, torre Agbar, Forum. L'habitació no té clau
## 4 Room available for rent.- PEDRO PEREZ. Shared with a Catalan male aged 38, Ayurvedic massage therapist and Yoga practitioner. Looking for people non-smoking, enthusiastic willing to share more than just the space in a centric beautiful flat in PLaça Vila de Gracia. i am very flexible you can use anything in the house feel free to ask anything! The neighborhood is really special you could live here and not needing anything from outside, such an experience, just 100 years ago was a village in the outskirts of barcelona, we do have our own cultural program throughout the year, very Catalan place. The area is full of bohemians, artisans and modern artists. Most of the area has been taken over by us over the past 10 years making it a mix between the past and the present-future. Metro stations around are: Diagonal L3-L5, Fontana L3, Joanic L4, 10-15 minutes walking to city center Ramblas. Separate Wardrobe room available Kitchen and bathroom shared Bills included available for renti
## 5 Great location, right next to Sagrada Familia, the symbol of Barcelona. Great infrastructure, a shopping area nearby. Next to two metro stations: Hospital de Sant Pau & Alfonso X. The room is in great condition, newly furnished. Private bathroom. Laundry place and kitchen at your disposition. Wi-fi internet. ХОРОШИЙ РАЙОН С РАЗВИТОЙ ИНФРАСТРУКТУРОЙ. РЯДОМ САГРАДА ФАМИЛИЯ, САН ПАУ, ПАРК ГУЭЛЬ, АВЕНИДА ГАУДИ. ОТ МЕТРО SAN PAU 3 МИН. ОТ ALFONS X 5 МИН - ПРЯМАЯ ВЕТКА ДО ПЛЯЖА. ЕВРОПЕЙСКИЕ УСЛОВИЯ. КОНДИЦИОНЕРЫ. БЕСПЛАТНЫЙ ИНТЕРНЕТ WI-FI ADSL. КОМПЬЮТЕР ПОЛНОСТЬЮ ОБОРУДОВАННАЯ КУХНЯ С ПОСУДОЙ. ЧАЙ, КОФЕ, САХАР - БЕСПЛАТНО. СТИРАЛЬНАЯ МАШИНА, УТЮГ, ОБОГРЕВ И Т.Д. ВЫДАЕМ ПОСТЕЛЬНОЕ БЕЛЬЕ, ПОЛОТЕНЦА, ВКЛЮЧАЯ ПЛЯЖНЫЕ, ФЕН, ШАМПУНЬ, ГЕЛЬ ДЛЯ ДУША.
## 6 Great location, right next to Sagrada Familia, the symbol of Barcelona. Great infrastructure, a shopping area nearby. Next to two metro stations: Hospital de Sant Pau & Alfonso X. The room is in great condition, newly furnished. A shared bathroom. Laundry place and kitchen at your disposition. Wi-fi internet. ХОРОШИЙ РАЙОН С РАЗВИТОЙ ИНФРАСТРУКТУРОЙ. РЯДОМ САГРАДА ФАМИЛИЯ, САН ПАУ, ПАРК ГУЭЛЬ, АВЕНИДА ГАУДИ. ОТ МЕТРО SAN PAU 3 МИН. ОТ ALFONS X 5 МИН - ПРЯМАЯ ВЕТКА ДО ПЛЯЖА. ЕВРОПЕЙСКИЕ УСЛОВИЯ. КОНДИЦИОНЕРЫ (кроме комнаты на 1). БЕСПЛАТНЫЙ ИНТЕРНЕТ WI-FI ADSL. КОМПЬЮТЕР ПОЛНОСТЬЮ ОБОРУДОВАННАЯ КУХНЯ С ПОСУДОЙ. ЧАЙ, КОФЕ, САХАР - БЕСПЛАТНО. СТИРАЛЬНАЯ МАШИНА, УТЮГ, ОБОГРЕВ И Т.Д. ВЫДАЕМ ПОСТЕЛЬНОЕ БЕЛЬЕ, ПОЛОТЕНЦА, ВКЛЮЧАЯ ПЛЯЖНЫЕ, ФЕН, ШАМПУНЬ, ГЕЛЬ ДЛЯ ДУША.
## description
## 1 Apartment located near the "Plaza de las Glorias" and the second-hand market (Encants). The accommodation is also close to the National Theatre of Catalunya and the Agbar Tower which has become one of the new symbols of Barcelona. Licence number: HUTB-(PHONE NUMBER HIDDEN) Nice apartment situated on the penthouse floor of a building with elevator. Huge Living/dining-room with double sofa-bed 1 bedroom with two single beds 1 bedroom with double bed Nice kitchen opened to the living/dining-room and fully equipped for 6 people Bathroom with shower The accommodation has been recently renovated and tastefully decorated with a comfortable furniture and wood floor. Also it is equipped with air-conditioning and heating. Free Wifi - air conditioning. We will provide basic amenities like shower gel, shampoo,and hand soap. Also, 1 set of bed linen and towels per person will be included. We can provide you all kind of entrance and tickets for monuments and shows in Barcelona in order you avo
## 2 110m2 apartment to rent in Barcelona. Located in the Eixample district, near the Sagrada Familia. It has a small balcony where you can see the temple of Gaudi. Capacity for 8 people. Licence number: HUTB-002062 Apartment with 110 m2 located in the 6th floor in a building with elevator Huge living/dinig-room 1 double bedrrom 1 bedroom with 2 single beds 1 bedroom with bunk beds Kitchen fully equipped for 8 people 1 bathroom with bathtub 1 small bathroom with shower balcony The accommodation has been recently renovated and tastefully decorated with a comfortable furniture and wood floor. Also it is equipped with heating, air conditioning and wifi. Free Wifi - air conditioning. We will provide basic amenities like shower gel, shampoo,and hand soap. Also, 1 set of bed linen and towels per person will be included. We can provide you all kind of entrance and tickets for monuments and shows in Barcelona in order you avoid queues and plan your trip in advance. Also we can organize sh
## 3 The flat is in Poblenou district, and the room is a double room with a double bed, a wardrobe, a table, TV, wifi, heating and wood floor. Beautiful and charmy. L'apartament està al barri de Poblenou, i l'habitació te un llit doble de (Phone number hidden by Airbnb) armari, una taula i cadira d'estudi, TV, wifi, calefacció i terra de parquet. Bonica i encantadora. A prop de la platja, Glòries, Sagrada Família, museu del disseny, els encants, rambla del Poblenou, torre Agbar, Forum. L'habitació no té clau The kitchen is fully equipped and can use the washer and dryer. We also have a beautiful balcony on the apartment. And, of course, you can use the bathroom and the living and dining room. My husband and I will be available in person or by phone/ (Hidden by Airbnb) for any questions you have during your stay. Poblenou as one of the few areas that has grown independently, keeping away from fleeting trends and maintaining its identity. As a result it has become one of the most genuine and
## 4 JUST GO THROUGH THE MANY REVIEWS I GOT THROUGH THE YEARS, NO BETTER FEEDBACK THAN THAT. WELCOME. Room available for rent.- PEDRO PEREZ. Shared with a Catalan male aged 38, Ayurvedic massage therapist and Yoga practitioner. Looking for people non-smoking, enthusiastic willing to share more than just the space in a centric beautiful flat in PLaça Vila de Gracia. i am very flexible you can use anything in the house feel free to ask anything! The neighborhood is really special you could live here and not needing anything from outside, such an experience, just 100 years ago was a village in the outskirts of barcelona, we do have our own cultural program throughout the year, very Catalan place. The area is full of bohemians, artisans and modern artists. Most of the area has been taken over by us over the past 10 years making it a mix between the past and the present-future. Metro stations around are: Diagonal L3-L5, Fontana L3, Joanic L4, 10-15 minutes walking to city center Ramblas. S
## 5 The room in 500 m from Sagrada Familia. 3 branches of the subway in 7 minutes of walking. A straight line to the center and the beach. A safe area area with the developed infrastructure. Completely equipped kitchen, a washing machine, an air conditioner. Private bathroom. Wi-fi free. Great location, right next to Sagrada Familia, the symbol of Barcelona. Great infrastructure, a shopping area nearby. Next to two metro stations: Hospital de Sant Pau & Alfonso X. The room is in great condition, newly furnished. Private bathroom. Laundry place and kitchen at your disposition. Wi-fi internet. ХОРОШИЙ РАЙОН С РАЗВИТОЙ ИНФРАСТРУКТУРОЙ. РЯДОМ САГРАДА ФАМИЛИЯ, САН ПАУ, ПАРК ГУЭЛЬ, АВЕНИДА ГАУДИ. ОТ МЕТРО SAN PAU 3 МИН. ОТ ALFONS X 5 МИН - ПРЯМАЯ ВЕТКА ДО ПЛЯЖА. ЕВРОПЕЙСКИЕ УСЛОВИЯ. КОНДИЦИОНЕРЫ. БЕСПЛАТНЫЙ ИНТЕРНЕТ WI-FI ADSL. КОМПЬЮТЕР ПОЛНОСТЬЮ ОБОРУДОВАННАЯ КУХНЯ С ПОСУДОЙ. ЧАЙ, КОФЕ, САХАР - БЕСПЛАТНО. СТИРАЛЬНАЯ МАШИНА, УТЮГ, ОБОГРЕВ И Т.Д. ВЫДАЕМ ПОСТЕЛЬНОЕ БЕЛЬЕ, ПОЛОТЕНЦА, ВК
## 6 Room for 2-3. Great location, right next to Sagrada Familia, the symbol of Barcelona. Great infrastructure, a shopping area nearby. Next to two metro stations: Hospital de Sant Pau & Alfonso X. The room is in great condition, newly furnished. Laundry place and kitchen at your disposition, air conditioner.. Wi-fi.free. Great location, right next to Sagrada Familia, the symbol of Barcelona. Great infrastructure, a shopping area nearby. Next to two metro stations: Hospital de Sant Pau & Alfonso X. The room is in great condition, newly furnished. A shared bathroom. Laundry place and kitchen at your disposition. Wi-fi internet. ХОРОШИЙ РАЙОН С РАЗВИТОЙ ИНФРАСТРУКТУРОЙ. РЯДОМ САГРАДА ФАМИЛИЯ, САН ПАУ, ПАРК ГУЭЛЬ, АВЕНИДА ГАУДИ. ОТ МЕТРО SAN PAU 3 МИН. ОТ ALFONS X 5 МИН - ПРЯМАЯ ВЕТКА ДО ПЛЯЖА. ЕВРОПЕЙСКИЕ УСЛОВИЯ. КОНДИЦИОНЕРЫ (кроме комнаты на 1). БЕСПЛАТНЫЙ ИНТЕРНЕТ WI-FI ADSL. КОМПЬЮТЕР ПОЛНОСТЬЮ ОБОРУДОВАННАЯ КУХНЯ С ПОСУДОЙ. ЧАЙ, КОФЕ, САХАР - БЕСПЛАТНО. СТИРАЛЬНАЯ МАШИНА, УТЮГ,
## neighborhood_overview
## 1 Apartment in Barcelona near to the Plaza de las Glorias, the old market (Encants), the Agbar Tower one of the new symbols of Barcelona and the Teatre Nacional de Catalunya. All kinds of services in surroundings (shops, supermarkets, restaurants, bars).
## 2 Apartment in Barcelona located in the heart of Eixample district, within only 150 m form the great Sagrada Familia and really near of Gaudí Avenue and the famous Sant Pau Hospital . All kind of services in surroundings (shops, supermarkets, restaurants, bars).
## 3 Poblenou as one of the few areas that has grown independently, keeping away from fleeting trends and maintaining its identity. As a result it has become one of the most genuine and prolific metropolitan scenarios of Barcelona city. In recent years, a series of creative hubs have found their home in Poblenou, cultural and commercial spaces that offer similar innovative proposals, becoming part of the neighbourhood’s future without giving up its industrial past. To the mission of the neighbourhood’s normalization, the work of the entrepreneurs has been added, raising the area’s value and adding it to the map of alternative cultural circuits. See more info in (Website hidden by Airbnb)
## 4 Solo decir que a menudo ni salgo del barrio. Muy entretenido con sus gentes y lugares.
## 5 Faltante
## 6 Faltante
## picture_url
## 1 https://a0.muscache.com/im/pictures/47f88bc6-6561-445a-beec-f8ec4ddc1038.jpg?aki_policy=large
## 2 https://a0.muscache.com/im/pictures/13031453/413cdbfc_original.jpg?aki_policy=large
## 3 https://a0.muscache.com/im/pictures/774ca73d-13f4-4848-83c9-965d8332af8a.jpg?aki_policy=large
## 4 https://a0.muscache.com/im/pictures/6619f0c7-844e-40a1-8521-44c19b7a4af2.jpg?aki_policy=large
## 5 https://a0.muscache.com/im/pictures/ac805ead-12f0-4ebe-89b3-53ea9ede132f.jpg?aki_policy=large
## 6 https://a0.muscache.com/im/pictures/84928126/44a74321_original.jpg?aki_policy=large
## host_id host_url host_name host_since
## 1 71615 https://www.airbnb.com/users/show/71615 Mireia And Maria 19/01/2010
## 2 71615 https://www.airbnb.com/users/show/71615 Mireia And Maria 19/01/2010
## 3 82522 https://www.airbnb.com/users/show/82522 Meritxell 18/02/2010
## 4 108310 https://www.airbnb.com/users/show/108310 Pedro 14/04/2010
## 5 134698 https://www.airbnb.com/users/show/134698 Svetlana 29/05/2010
## 6 134698 https://www.airbnb.com/users/show/134698 Svetlana 29/05/2010
## host_response_time host_response_rate host_is_superhost
## 1 within an hour 99% f
## 2 within an hour 99% f
## 3 within a few hours 100% f
## 4 within an hour 100% t
## 5 within an hour 100% f
## 6 within an hour 100% f
## host_picture_url
## 1 https://a0.muscache.com/im/users/71615/profile_pic/1426612511/original.jpg?aki_policy=profile_x_medium
## 2 https://a0.muscache.com/im/users/71615/profile_pic/1426612511/original.jpg?aki_policy=profile_x_medium
## 3 https://a0.muscache.com/im/pictures/ece65ffd-a798-4209-b1b0-a51060412b29.jpg?aki_policy=profile_x_medium
## 4 https://a0.muscache.com/im/pictures/user/2b13f530-a8dd-4777-93a5-a133ac46b97d.jpg?aki_policy=profile_x_medium
## 5 https://a0.muscache.com/im/users/134698/profile_pic/1334849467/original.jpg?aki_policy=profile_x_medium
## 6 https://a0.muscache.com/im/users/134698/profile_pic/1334849467/original.jpg?aki_policy=profile_x_medium
## host_neighbourhood host_listings_count
## 1 El Camp de l'Arpa del Clot 45
## 2 El Camp de l'Arpa del Clot 45
## 3 El Poblenou 2
## 4 Vila de Gràcia 1
## 5 El Baix Guinardó 9
## 6 El Baix Guinardó 9
## host_verifications
## 1 ['email', 'phone', 'reviews', 'jumio', 'government_id']
## 2 ['email', 'phone', 'reviews', 'jumio', 'government_id']
## 3 ['email', 'phone', 'reviews', 'jumio', 'offline_government_id', 'government_id']
## 4 ['email', 'phone', 'reviews', 'jumio', 'offline_government_id', 'selfie', 'government_id', 'identity_manual']
## 5 ['email', 'phone', 'reviews']
## 6 ['email', 'phone', 'reviews']
## host_has_profile_pic host_identity_verified street
## 1 t t Barcelona, CT, Spain
## 2 t t Barcelona, CT, Spain
## 3 t t Barcelona, Catalunya, Spain
## 4 t t Barcelona, Barcelona, Spain
## 5 t f Barcelona, CT, Spain
## 6 t f Barcelona, CT, Spain
## neighbourhood neighbourhood_cleansed neighbourhood_group_cleansed
## 1 Sant Martí el Camp de l'Arpa del Clot Sant Martí
## 2 La Sagrada Família la Sagrada Família Eixample
## 3 Sant Martí el Poblenou Sant Martí
## 4 Vila de Gràcia la Vila de Gràcia Gràcia
## 5 Horta-Guinardó el Baix Guinardó Horta-Guinardó
## 6 Horta-Guinardó el Baix Guinardó Horta-Guinardó
## city zipcode country latitude longitude is_location_exact property_type
## 1 barcelona 8026 Spain 41.40889 2.18555 t Apartment
## 2 barcelona 8025 Spain 41.40420 2.17306 t Apartment
## 3 barcelona 8018 Spain 41.40560 2.19821 t Apartment
## 4 barcelona 8012 Spain 41.40145 2.15645 t Apartment
## 5 barcelona 8025 Spain 41.41097 2.17070 t Apartment
## 6 barcelona 8025 Spain 41.41090 2.17082 t Apartment
## room_type accommodates bathrooms bedrooms beds
## 1 Entire home/apt 6 1 2 4
## 2 Entire home/apt 8 2 3 6
## 3 Private room 2 1 1 1
## 4 Private room 2 1 1 1
## 5 Private room 2 1 1 2
## 6 Private room 3 1 1 2
## amenities
## 1 {TV,Internet,Wifi,"Air conditioning","Wheelchair accessible",Kitchen,Elevator,"Free street parking",Heating,"Family/kid friendly",Washer,Dryer,Essentials,Shampoo,"Hair dryer","Hot water","Host greets you","Paid parking on premises"}
## 2 {TV,Internet,Wifi,"Air conditioning","Wheelchair accessible",Kitchen,Elevator,"Free street parking","Buzzer/wireless intercom",Heating,"Family/kid friendly",Washer,Essentials,Shampoo,Hangers,"Hair dryer",Iron,"Laptop friendly workspace",Crib,"Hot water","Host greets you","Paid parking on premises"}
## 3 {TV,Wifi,Kitchen,"Paid parking off premises",Elevator,Heating,"Family/kid friendly",Washer,Dryer,"First aid kit",Essentials,Shampoo,Hangers,"Hair dryer",Iron,"Laptop friendly workspace","Self check-in","Smart lock","Hot water","Bed linens","Extra pillows and blankets",Microwave,"Coffee maker",Refrigerator,Dishwasher,"Dishes and silverware","Cooking basics",Oven,Stove,"Patio or balcony","Luggage dropoff allowed","Cleaning before checkout","No stairs or steps to enter","Wide entrance for guests","Flat path to guest entrance","Well-lit path to entrance","No stairs or steps to enter","No stairs or steps to enter","No stairs or steps to enter","Wide entryway","Paid parking on premises"}
## 4 {TV,Wifi,"Air conditioning",Kitchen,"Smoking allowed",Elevator,Heating,"Family/kid friendly",Washer,"Fire extinguisher",Essentials,Shampoo,"Lock on bedroom door",Hangers,"Hair dryer","Hot water","Luggage dropoff allowed"}
## 5 {Wifi,"Air conditioning",Kitchen,"Paid parking off premises","Buzzer/wireless intercom",Heating,"Family/kid friendly",Washer,Essentials,Shampoo,"Lock on bedroom door",Hangers,"Hair dryer",Iron,"translation missing: en.hosting_amenity_50","Bed linens","Extra pillows and blankets",Microwave,"Coffee maker",Refrigerator,"Dishes and silverware","Cooking basics",Oven,Stove,"Luggage dropoff allowed","Host greets you"}
## 6 {Wifi,"Air conditioning",Kitchen,"Paid parking off premises","Buzzer/wireless intercom",Heating,"Family/kid friendly",Washer,Essentials,Shampoo,"Lock on bedroom door",Hangers,"Hair dryer",Iron,"Private living room","Hot water","Bed linens","Extra pillows and blankets",Microwave,"Coffee maker",Refrigerator,"Dishes and silverware","Cooking basics",Oven,Stove,"Host greets you"}
## price cleaning_fee minimum_nights maximum_nights has_availability
## 1 130 42 3 730 t
## 2 60 50 1 1125 t
## 3 33 0 2 1125 t
## 4 45 0 1 730 t
## 5 42 0 3 1125 t
## 6 53 0 3 1125 t
## availability_30 availability_60 availability_90 availability_365
## 1 0 0 0 182
## 2 3 20 50 129
## 3 4 8 15 15
## 4 8 19 41 115
## 5 5 8 16 211
## 6 3 8 15 211
## number_of_reviews number_of_reviews_ltm first_review last_review
## 1 1 0 10/10/2015 10/10/2015
## 2 15 10 27/05/2013 02/07/2019
## 3 119 36 08/05/2016 04/07/2019
## 4 241 49 11/08/2010 03/07/2019
## 5 4 0 20/05/2015 12/03/2018
## 6 40 2 20/06/2015 12/05/2019
## review_scores_rating review_scores_accuracy review_scores_cleanliness
## 1 80 10 10
## 2 87 9 9
## 3 90 10 9
## 4 95 10 10
## 5 95 9 10
## 6 87 9 9
## review_scores_checkin review_scores_communication review_scores_location
## 1 2 10 10
## 2 10 10 9
## 3 10 10 9
## 4 10 10 10
## 5 10 10 9
## 6 9 9 8
## review_scores_value instant_bookable reviews_per_month distrito
## 1 8 f 0.02 Sant Martí
## 2 8 t 0.20 Eixample
## 3 9 f 3.08 Sant Martí
## 4 9 t 2.22 Gràcia
## 5 9 f 0.08 Horta-Guinardó
## 6 9 f 0.81 Horta-Guinardó
## criminalidad_distrito city_clean num_paradas_bus num_paradas_transport
## 1 25408 Barcelona 356 70
## 2 46754 Barcelona 405 120
## 3 25408 Barcelona 356 70
## 4 8588 Barcelona 210 17
## 5 10057 Barcelona 389 31
## 6 10057 Barcelona 389 31
## total_transporte_publico total_centros_salud_distrito
## 1 426 252
## 2 525 535
## 3 426 252
## 4 227 193
## 5 420 188
## 6 420 188
## total_parques_jardines_distrito Sagrada_Familia Las_Ramblas Barceloneta
## 1 54 1.0979743 2.903276 3.236065
## 2 116 0.1151974 2.083905 3.152019
## 3 54 2.0057229 3.250696 2.824770
## 4 23 1.5136572 2.083059 3.885951
## 5 45 0.8567309 2.820638 3.899005
## 6 45 0.8459359 2.813136 3.887300
## Casa_Batllo room_type_normalizado n_amenities log_precio_estimado_pls
## 1 2.570126 1 18 4.819647
## 2 1.542214 1 22 5.171727
## 3 3.179456 0 41 3.748777
## 4 1.287301 0 17 3.748777
## 5 2.191177 0 26 3.748777
## 6 2.185826 0 26 3.898285
## precio_estimado_pls ascensor ascensor_normalizado aparcamiento tiene_parking
## 1 123.92 TRUE 1 TRUE FALSE
## 2 176.22 TRUE 1 TRUE FALSE
## 3 42.47 TRUE 1 TRUE FALSE
## 4 42.47 TRUE 1 FALSE FALSE
## 5 42.47 FALSE 0 TRUE FALSE
## 6 49.32 FALSE 0 TRUE FALSE
## garaje_propio_normalizado adicionales adicionales_normalizado
## 1 0 FALSE 0
## 2 0 FALSE 0
## 3 0 FALSE 0
## 4 0 FALSE 0
## 5 0 FALSE 0
## 6 0 FALSE 0
## log_precio_estimado_finca precio_relativo_estimado_finca
## 1 4.763387 117.14
## 2 5.156077 173.48
## 3 3.737730 42.00
## 4 3.783216 43.96
## 5 3.660999 38.90
## 6 3.809334 45.12
## pls_precio_estimado_finca precio_estimado_pls_finca
## 1 123.68704 599
## 2 179.28259 599
## 3 55.25931 599
## 4 48.78163 599
## 5 43.22801 599
## 6 51.47818 599
MODELO PLS OBJETIVO 3
En este primer bloque seleccionamos las variables que nos interesan para el análisis PLS. Estas incluyen tanto factores estructurales del piso —como número de habitaciones, camas o si tiene ascensor o garaje— como características del barrio, por ejemplo, centros de salud, transporte público o nivel de criminalidad.
A continuación, eliminamos las filas con valores perdidos para evitar problemas en el ajuste del modelo.
Después separamos las variables predictoras (X) de la variable respuesta, que en este caso es el precio del alojamiento (price).
Finalmente, ajustamos un modelo PLS con validación cruzada para determinar el número óptimo de componentes que mejor explica la variabilidad del precio.
En este segundo bloque mostramos el resumen del modelo ajustado, que nos da información sobre la varianza explicada y la carga de las variables.
También generamos el gráfico del RMSEP, que representa el error de predicción frente al número de componentes. Este gráfico nos ayuda a decidir cuántos componentes usar en el modelo, buscando el punto donde el error se estabiliza o deja de disminuir significativamente.
vars_pls <- c(
"ascensor_normalizado", "garaje_propio_normalizado",
"total_transporte_publico", "total_centros_salud_distrito",
"total_parques_jardines_distrito", "criminalidad_distrito",
"bedrooms", "bathrooms", "accommodates", "beds",
"Las_Ramblas","price","room_type_normalizado", "Las_Ramblas",
"Sagrada_Familia", "Barceloneta", "Casa_Batllo"
)
data_pls <- data_clean %>%
select(all_of(vars_pls)) %>%
filter(if_all(everything(), ~ !is.na(.)))
X <- data_pls %>% select(-price)
Y <- log(data_pls$price)
modelo_pls <- plsr(Y ~ ., data = X, scale = TRUE, validation = "CV")
summary(modelo_pls)
## Data: X dimension: 16438 15
## Y dimension: 16438 1
## Fit method: kernelpls
## Number of components considered: 15
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 0.7273 0.533 0.5165 0.5068 0.5051 0.5023 0.501
## adjCV 0.7273 0.533 0.5165 0.5068 0.5050 0.5023 0.501
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 0.5005 0.5004 0.5003 0.5001 0.4999 0.4999 0.4998
## adjCV 0.5005 0.5004 0.5003 0.5001 0.4999 0.4998 0.4997
## 14 comps 15 comps
## CV 0.4997 0.4996
## adjCV 0.4997 0.4996
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 27.04 45.64 53.62 69.92 74.18 77.11 81.03 84.85
## Y 46.33 49.62 51.53 51.86 52.38 52.65 52.73 52.76
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 88.88 91.38 93.37 98.06 98.75 99.81 100.00
## Y 52.78 52.81 52.86 52.87 52.89 52.90 52.91
plot(RMSEP(modelo_pls), main = "Error de predicción vs número de componentes")
loadings(modelo_pls)
##
## Loadings:
## Comp 1 Comp 2 Comp 3 Comp 4 Comp 5 Comp 6
## ascensor_normalizado -0.179 0.101 0.294 -0.677
## garaje_propio_normalizado -0.130 0.534
## total_transporte_publico 0.152 -0.403 0.483 -0.264 0.124
## total_centros_salud_distrito 0.224 -0.520 0.126 0.248 -0.184 0.158
## total_parques_jardines_distrito 0.214 -0.512 0.127 0.291 -0.213
## criminalidad_distrito 0.172 -0.369 0.292 -0.348
## bedrooms 0.419 0.147 -0.358 -0.348
## bathrooms 0.266 -0.583 0.813 -0.434
## accommodates 0.450 0.199 -0.107 0.446
## beds 0.430 0.165 -0.307 -0.254
## Las_Ramblas -0.174 0.216 -0.388 0.533 -0.343 0.253
## room_type_normalizado 0.365 0.287 0.450 0.138 -0.195
## Sagrada_Familia -0.157 0.225 -0.140 0.221 -0.441
## Barceloneta -0.109 -0.336 0.593 -0.287
## Casa_Batllo -0.217 0.296 -0.360 0.401 -0.274 0.161
## Comp 7 Comp 8 Comp 9 Comp 10 Comp 11 Comp 12
## ascensor_normalizado 0.637 0.405 -0.717 0.545 -0.560 0.342
## garaje_propio_normalizado -0.663 1.178 -0.895 0.322
## total_transporte_publico -0.142 -0.189 0.336 -0.116
## total_centros_salud_distrito 0.202 0.164
## total_parques_jardines_distrito -0.101 -0.196 -0.197
## criminalidad_distrito 0.117 -0.182 -0.137 0.443 0.190 -0.487
## bedrooms 0.123 0.293 0.282 0.125 -0.272 -0.109
## bathrooms -0.168 0.214 -0.115
## accommodates 0.242 -0.183
## beds -0.322 -0.254 -0.248 0.472
## Las_Ramblas -0.286 0.524 -0.235
## room_type_normalizado -0.304 0.137 0.210
## Sagrada_Familia 0.279 0.352 -0.102 -0.440 1.219 -0.729
## Barceloneta -0.108 0.219 0.252 -0.585 -0.181 0.186
## Casa_Batllo -0.384 -0.177 0.620 -0.319
## Comp 13 Comp 14 Comp 15
## ascensor_normalizado
## garaje_propio_normalizado
## total_transporte_publico -0.292 0.662 -0.679
## total_centros_salud_distrito -0.209 -0.417 0.164
## total_parques_jardines_distrito -0.105 0.556
## criminalidad_distrito 1.142 -0.441 -0.125
## bedrooms -0.174 0.250
## bathrooms
## accommodates 0.157
## beds -0.167
## Las_Ramblas 0.224 0.332
## room_type_normalizado
## Sagrada_Familia 0.146
## Barceloneta 0.904 -0.543
## Casa_Batllo -0.136 -0.173
##
## Comp 1 Comp 2 Comp 3 Comp 4 Comp 5 Comp 6 Comp 7 Comp 8 Comp 9
## SS loadings 1.041 1.231 1.321 1.464 1.302 1.512 1.173 2.216 1.652
## Proportion Var 0.069 0.082 0.088 0.098 0.087 0.101 0.078 0.148 0.110
## Cumulative Var 0.069 0.151 0.240 0.337 0.424 0.525 0.603 0.751 0.861
## Comp 10 Comp 11 Comp 12 Comp 13 Comp 14 Comp 15
## SS loadings 2.075 2.390 1.111 2.401 1.209 1.000
## Proportion Var 0.138 0.159 0.074 0.160 0.081 0.067
## Cumulative Var 0.999 1.159 1.233 1.393 1.473 1.540
En el último bloque identificamos el número óptimo de componentes a partir del gráfico anterior, y extraemos las cargas o loadings del modelo, que nos indican qué variables tienen más peso o importancia en la construcción de los componentes.
Luego, representamos gráficamente esas cargas para el primer componente. Este gráfico nos permite visualizar de forma clara qué variables están influyendo más en el modelo —por ejemplo, si el número de camas o la cercanía a lugares como Las Ramblas tienen mayor peso que otras variables.
Esta interpretación es fundamental para entender qué factores están más asociados al precio en nuestro análisis con PLS.
opt_comp <- which.min(RMSEP(modelo_pls)$val[1, , -1])
loadings_df <- as.data.frame(loadings(modelo_pls)[, 1:opt_comp])
loadings_df$variable <- rownames(loadings_df)
colnames(loadings_df)[1] <- "Comp1"
library(ggplot2)
ggplot(loadings_df, aes(x = reorder(variable, -Comp1), y = Comp1)) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(
title = "Importancia de las variables en el primer componente PLS",
x = "Variable",
y = "Carga (Comp 1)"
) +
coord_flip() +
theme_minimal()
data_clean$pls_log_pred_nuevo <- predict(modelo_pls, ncomp = 1)[, , 1]
data_clean$pls_precio_estimado_nuevo <- round(exp(data_clean$pls_log_pred_nuevo), 2)
data_clean$pls_precio_estimado_nuevo <- pmin(data_clean$pls_precio_estimado_nuevo, max(data_clean$price, na.rm = TRUE))
library(dplyr)
pls_precio_medio_distrito_nuevo <- data_clean %>%
group_by(distrito) %>%
summarise(
n_pisos = n(),
precio_medio_estimado_pls = round(mean(pls_precio_estimado_nuevo, na.rm = TRUE), 2),
precio_medio_real = round(mean(price, na.rm = TRUE), 2),
diferencia_absoluta = round(abs(precio_medio_estimado_pls - precio_medio_real), 2)
)
if (!"distrito" %in% colnames(distritos_sf)) {
distritos_sf <- distritos_sf %>% rename(distrito = NOM)
}
mapa_distritos_pls_nuevo <- left_join(distritos_sf, pls_precio_medio_distrito_nuevo, by = "distrito")
ggplot(mapa_distritos_pls_nuevo) +
geom_sf(aes(fill = precio_medio_estimado_pls), color = "white") +
scale_fill_viridis_c(option = "plasma", name = "€/noche") +
labs(
title = "Precio estimado (PLS) por noche en los distritos de Barcelona",,
) +
theme_minimal()
Para explorar la relación entre las variables del entorno y las características de los pisos con el precio, he utilizado un modelo PLS o de mínimos cuadrados parciales. Este modelo es especialmente útil cuando tenemos muchas variables predictoras, algunas de ellas posiblemente correlacionadas entre sí. Además, permite obtener componentes latentes que explican tanto la variabilidad de las variables independientes como de la respuesta.En primer lugar, seleccioné un conjunto de variables tanto del entorno —como el número de medios de transporte, centros de salud, parques, criminalidad, y si tiene ascensor o garaje— como características internas del piso —número de habitaciones, camas, baños, capacidad, tipo de habitación, y si está cerca de Las Ramblas—.Luego filtré aquellas observaciones sin datos faltantes para poder ajustar correctamente el modelo. Utilicé validación cruzada para determinar cuántos componentes utilizar. En el gráfico de la izquierda (mostrando el RMSEP), observamos que el error de predicción desciende drásticamente hasta el componente 6, a partir del cual se estabiliza. Por tanto, seleccionamos 6 componentes como número óptimo para evitar sobreajuste sin perder capacidad explicativa. En el gráfico de barras (segunda imagen), representamos la importancia relativa de cada variable en el primer componente PLS, que es el que más varianza explica en la variable dependiente (el precio). Observamos que las variables con mayor carga positiva son accommodates, beds, bedrooms, bathrooms y la cercanía a Las Ramblas. Esto indica que los pisos con mayor capacidad, más camas y habitaciones, y ubicados cerca de Las Ramblas tienden a tener precios más altos. En cambio, el tipo de habitación “Private room” o “Shared room” tienen una carga negativa, lo que sugiere una relación inversa con el precio (es decir, son más baratos). Como se muestra en la tabla de resumen, los seis primeros componentes explican aproximadamente el 69% de la varianza acumulada en las variables predictoras. Esto implica que el modelo está capturando una buena parte de la información relevante para predecir el precio. En resumen, el modelo PLS nos ha permitido identificar qué características estructurales y del entorno influyen más en el precio de los pisos. Podemos concluir que la capacidad del alojamiento, el tipo de habitación y la ubicación (por ejemplo, cercanía a Las Ramblas) son factores clave en la estimación del precio. Este análisis puede ser útil para propietarios o plataformas que buscan predecir o ajustar precios de forma más eficiente teniendo en cuenta tanto el interior del piso como su contexto urbano.
library(sf)
library(ggplot2)
barcelona <- st_read("0301040100_Districtes_UNITATS_ADM.shp")
## Reading layer `0301040100_Districtes_UNITATS_ADM' from data source
## `C:\Users\usuario\Desktop\Proyecto II\0301040100_Districtes_UNITATS_ADM.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 10 features and 46 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 420812.5 ymin: 4574282 xmax: 435480.4 ymax: 4591066
## Projected CRS: ETRS89 / UTM zone 31N
lon <- c(2.1743128817274022, 2.169795139397786, 2.193627948808803, 2.164892558555271)
lat <- c(41.40376667643144, 41.385640869698186, 41.38045855785224, 41.391774395024854)
nombres <- c("Sagrada Familia", "Las Ramblas", "Barceloneta", "Casa Batlló")
puntos <- data.frame(lon = lon, lat = lat, nombre = nombres)
puntos_sf <- st_as_sf(puntos, coords = c("lon", "lat"), crs = 4326)
puntos_sf <- st_transform(puntos_sf, st_crs(barcelona))
ggplot() +
geom_sf(data = barcelona, fill = "white", color = "black") +
geom_sf(data = puntos_sf, color = "red", size = 3) +
geom_sf_text(data = puntos_sf, aes(label = nombre), nudge_y = 50, size = 3) +
theme_minimal() +
ggtitle("Puntos rojos con nombres en Barcelona")
OBJETIVO 4 : En este objetvo veremos la relacion entre el precio y las variables de valoracion de los usuarios.
data_clean$price <- as.numeric(gsub("[\\$,]", "", data_clean$price))
vars <- c("price",
"review_scores_rating",
"review_scores_accuracy",
"review_scores_cleanliness",
"review_scores_checkin",
"review_scores_communication",
"review_scores_location",
"review_scores_value")
cor_matrix <- cor(data_clean[, vars], use = "pairwise.complete.obs")
cor_with_price <- cor_matrix["price", ]
print(cor_with_price)
## price review_scores_rating
## 1.000000000 -0.065433171
## review_scores_accuracy review_scores_cleanliness
## -0.068932675 -0.002679966
## review_scores_checkin review_scores_communication
## -0.122302659 -0.072981065
## review_scores_location review_scores_value
## -0.023857376 -0.097146040
vars <- c("review_scores_rating",
"review_scores_accuracy",
"review_scores_cleanliness",
"review_scores_checkin",
"review_scores_communication",
"review_scores_location",
"review_scores_value")
results <- data.frame(
variable = character(),
correlation = numeric(),
p_value = numeric(),
significant = logical(),
stringsAsFactors = FALSE
)
for (v in vars) {
test <- cor.test(data_clean$price, data_clean[[v]], use = "pairwise.complete.obs")
results <- rbind(results, data.frame(
variable = v,
correlation = test$estimate,
p_value = test$p.value,
significant = test$p.value < 0.05
))
}
print(results)
## variable correlation p_value significant
## cor review_scores_rating -0.065433171 4.557012e-17 TRUE
## cor1 review_scores_accuracy -0.068932675 8.929020e-19 TRUE
## cor2 review_scores_cleanliness -0.002679966 7.311659e-01 FALSE
## cor3 review_scores_checkin -0.122302659 8.245443e-56 TRUE
## cor4 review_scores_communication -0.072981065 7.340277e-21 TRUE
## cor5 review_scores_location -0.023857376 2.220955e-03 TRUE
## cor6 review_scores_value -0.097146040 9.156337e-36 TRUE
library(dplyr)
results <- data_clean %>%
filter(room_type == "Entire home/apt") %>%
summarise(across(all_of(vars),
~ list(cor.test(price, .x,
use = "pairwise.complete.obs",
method = "spearman")),
.names = "test_{col}")) %>%
pivot_longer(everything(),
names_to = "variable",
values_to = "test") %>%
mutate(
correlation = map_dbl(test, ~ unname(.x$estimate)),
p_value = map_dbl(test, ~ .x$p.value),
significant = p_value < 0.05
) %>%
select(variable, correlation, p_value, significant)
## Warning: There were 7 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `across(...)`.
## Caused by warning in `cor.test.default()`:
## ! Cannot compute exact p-value with ties
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 6 remaining warnings.
print(results)
## # A tibble: 7 × 4
## variable correlation p_value significant
## <chr> <dbl> <dbl> <lgl>
## 1 test_review_scores_rating -0.0302 7.11e- 3 TRUE
## 2 test_review_scores_accuracy -0.0941 4.53e-17 TRUE
## 3 test_review_scores_cleanliness 0.0218 5.22e- 2 FALSE
## 4 test_review_scores_checkin -0.111 4.60e-23 TRUE
## 5 test_review_scores_communication -0.0842 5.76e-14 TRUE
## 6 test_review_scores_location -0.0208 6.43e- 2 FALSE
## 7 test_review_scores_value -0.00924 4.10e- 1 FALSE
library(dplyr)
results <- data_clean %>%
filter(room_type == "Private room") %>%
summarise(across(all_of(vars),
~ list(cor.test(price, .x, use = "pairwise.complete.obs",
method = "spearman")),
.names = "test_{col}")) %>%
tidyr::pivot_longer(everything(),
names_to = "variable",
values_to = "test") %>%
mutate(
correlation = map_dbl(test, ~ unname(.x$estimate)),
p_value = map_dbl(test, ~ .x$p.value),
significant = p_value < 0.05
) %>%
select(variable, correlation, p_value, significant)
## Warning: There were 7 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `across(...)`.
## Caused by warning in `cor.test.default()`:
## ! Cannot compute exact p-value with ties
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 6 remaining warnings.
print(results)
## # A tibble: 7 × 4
## variable correlation p_value significant
## <chr> <dbl> <dbl> <lgl>
## 1 test_review_scores_rating 0.0449 3.49e- 5 TRUE
## 2 test_review_scores_accuracy 0.0261 1.59e- 2 TRUE
## 3 test_review_scores_cleanliness 0.0283 8.99e- 3 TRUE
## 4 test_review_scores_checkin 0.0209 5.44e- 2 FALSE
## 5 test_review_scores_communication 0.0494 5.14e- 6 TRUE
## 6 test_review_scores_location 0.121 6.02e-29 TRUE
## 7 test_review_scores_value -0.0108 3.19e- 1 FALSE
library(dplyr)
casa_entera <- data_clean %>%
filter(room_type == "Entire home/apt")
score_vars_1_a_10_apartamentos <- c(
"review_scores_accuracy",
"review_scores_checkin",
"review_scores_communication"
)
for (varname in score_vars_1_a_10_apartamentos) {
cat("Conteo de valores para la variable:", varname, "\n")
conteos <- casa_entera %>%
filter(!is.na(.data[[varname]])) %>%
group_by(.data[[varname]]) %>%
summarise(Count = n(), .groups = "drop") %>%
arrange(.data[[varname]])
print(conteos)
cat("\n--------------------------------------------\n\n")
}
## Conteo de valores para la variable: review_scores_accuracy
## # A tibble: 9 × 2
## review_scores_accuracy Count
## <dbl> <int>
## 1 2 24
## 2 3 1
## 3 4 19
## 4 5 13
## 5 6 82
## 6 7 128
## 7 8 471
## 8 9 2133
## 9 10 5062
##
## --------------------------------------------
##
## Conteo de valores para la variable: review_scores_checkin
## # A tibble: 9 × 2
## review_scores_checkin Count
## <dbl> <int>
## 1 2 26
## 2 3 1
## 3 4 13
## 4 5 14
## 5 6 63
## 6 7 94
## 7 8 442
## 8 9 1634
## 9 10 5646
##
## --------------------------------------------
##
## Conteo de valores para la variable: review_scores_communication
## # A tibble: 9 × 2
## review_scores_communication Count
## <dbl> <int>
## 1 2 26
## 2 3 1
## 3 4 20
## 4 5 15
## 5 6 57
## 6 7 59
## 7 8 350
## 8 9 1669
## 9 10 5736
##
## --------------------------------------------
library(dplyr)
habitacion <- data_clean %>%
filter(room_type == "Private room")
score_vars_1_a_10_habitaciones <- c(
"review_scores_cleanliness",
"review_scores_location",
"review_scores_communication",
"review_scores_accuracy"
)
for (varname in score_vars_1_a_10_habitaciones) {
cat("Conteo de valores para la variable:", varname, "\n")
if (!varname %in% names(habitacion)) {
cat(" ¡Atención!: la variable '", varname, "' NO existe en el data.frame\n\n", sep = "")
next
}
conteos <- habitacion %>%
filter(!is.na(.data[[varname]])) %>%
group_by(.data[[varname]]) %>%
summarise(Count = n(), .groups = "drop") %>%
arrange(.data[[varname]])
print(conteos)
cat("\n--------------------------------------------\n\n")
}
## Conteo de valores para la variable: review_scores_cleanliness
## # A tibble: 9 × 2
## review_scores_cleanliness Count
## <dbl> <int>
## 1 2 27
## 2 3 2
## 3 4 21
## 4 5 21
## 5 6 78
## 6 7 150
## 7 8 723
## 8 9 2598
## 9 10 4885
##
## --------------------------------------------
##
## Conteo de valores para la variable: review_scores_location
## # A tibble: 8 × 2
## review_scores_location Count
## <dbl> <int>
## 1 2 7
## 2 4 9
## 3 5 1
## 4 6 28
## 5 7 35
## 6 8 259
## 7 9 1652
## 8 10 6514
##
## --------------------------------------------
##
## Conteo de valores para la variable: review_scores_communication
## # A tibble: 9 × 2
## review_scores_communication Count
## <dbl> <int>
## 1 2 14
## 2 3 2
## 3 4 10
## 4 5 6
## 5 6 37
## 6 7 42
## 7 8 231
## 8 9 1334
## 9 10 6829
##
## --------------------------------------------
##
## Conteo de valores para la variable: review_scores_accuracy
## # A tibble: 8 × 2
## review_scores_accuracy Count
## <dbl> <int>
## 1 2 24
## 2 4 11
## 3 5 16
## 4 6 49
## 5 7 74
## 6 8 401
## 7 9 1973
## 8 10 5957
##
## --------------------------------------------
library(dplyr)
varname <- "review_scores_rating"
breaks <- seq(0, 100, by = 10)
casa_entera <- data_clean %>%
filter(room_type == "Entire home/apt")
conteos_intervalos <- casa_entera %>%
filter(!is.na(.data[[varname]])) %>%
mutate(Interval = cut(.data[[varname]],
breaks = breaks,
right = FALSE,
include.lowest = TRUE)) %>%
group_by(Interval) %>%
summarise(Count = n(), .groups = "drop") %>%
arrange(Interval)
print(conteos_intervalos)
## # A tibble: 8 × 2
## Interval Count
## <fct> <int>
## 1 [20,30) 24
## 2 [30,40) 3
## 3 [40,50) 30
## 4 [50,60) 17
## 5 [60,70) 141
## 6 [70,80) 325
## 7 [80,90) 1679
## 8 [90,100] 5714
library(dplyr)
casa_entera <- data_clean %>%
filter(room_type == "Entire home/apt")
score_vars_apartamentos <- c(
"review_scores_accuracy",
"review_scores_checkin",
"review_scores_communication"
)
results_by_range <- data.frame(
variable = character(),
range = character(),
n = integer(),
cor_spearman = numeric(),
stringsAsFactors = FALSE
)
define_range <- function(score) {
cut(score,
breaks = c(0, 4, 7, 10),
labels = c("1-4", "5-7", "8-10"),
include.lowest = TRUE,
right = TRUE)
}
for (var in score_vars_apartamentos) {
data_temp <- casa_entera %>%
select(price, all_of(var)) %>%
filter(!is.na(.data[[var]]), !is.na(price)) %>%
mutate(range = define_range(.data[[var]]))
summary_by_range <- data_temp %>%
group_by(range) %>%
summarise(
n = n(),
cor_spearman = ifelse(n > 2,
cor(price, .data[[var]], method = "spearman"),
NA_real_),
.groups = "drop"
) %>%
mutate(variable = var) %>%
select(variable, range, n, cor_spearman)
results_by_range <- bind_rows(results_by_range, summary_by_range)
}
print(results_by_range)
## variable range n cor_spearman
## 1 review_scores_accuracy 1-4 44 -0.0781793738
## 2 review_scores_accuracy 5-7 223 0.1258514393
## 3 review_scores_accuracy 8-10 7666 -0.1058683724
## 4 review_scores_checkin 1-4 40 -0.1930436387
## 5 review_scores_checkin 5-7 171 0.0102045821
## 6 review_scores_checkin 8-10 7722 -0.1121871607
## 7 review_scores_communication 1-4 47 -0.0670985963
## 8 review_scores_communication 5-7 131 -0.0009877917
## 9 review_scores_communication 8-10 7755 -0.0911791803
library(dplyr)
library(dplyr)
private_room <- data_clean %>%
filter(room_type == "Private room")
score_vars_habitaciones <- c(
"review_scores_cleanliness",
"review_scores_location",
"review_scores_communication",
"review_scores_accuracy"
)
results_by_range_pr <- data.frame(
variable = character(),
range = character(),
n = integer(),
cor_spearman = numeric(),
stringsAsFactors = FALSE
)
define_range <- function(score) {
cut(score,
breaks = c(0, 4, 7, 10),
labels = c("1-4", "5-7", "8-10"),
include.lowest = TRUE,
right = TRUE)
}
for (var in score_vars_habitaciones) {
data_temp <- private_room %>%
select(price, all_of(var)) %>%
filter(!is.na(.data[[var]]), !is.na(price)) %>%
mutate(range = define_range(.data[[var]]))
summary_by_range <- data_temp %>%
group_by(range) %>%
summarise(
n = n(),
cor_spearman = if (n > 2) cor(price, .data[[var]], method = "spearman") else NA_real_,
.groups = "drop"
) %>%
mutate(variable = var) %>%
select(variable, range, n, cor_spearman)
results_by_range_pr <- bind_rows(results_by_range_pr, summary_by_range)
}
print(results_by_range_pr)
## variable range n cor_spearman
## 1 review_scores_cleanliness 1-4 50 0.11568640
## 2 review_scores_cleanliness 5-7 249 0.08564319
## 3 review_scores_cleanliness 8-10 8206 0.03064383
## 4 review_scores_location 1-4 16 -0.06852852
## 5 review_scores_location 5-7 64 -0.05103583
## 6 review_scores_location 8-10 8425 0.12515108
## 7 review_scores_communication 1-4 26 -0.01391756
## 8 review_scores_communication 5-7 85 0.10412008
## 9 review_scores_communication 8-10 8394 0.04965383
## 10 review_scores_accuracy 1-4 35 0.09759872
## 11 review_scores_accuracy 5-7 139 0.07840985
## 12 review_scores_accuracy 8-10 8331 0.03213290
library(dplyr)
library(fastDummies)
library(pls)
library(tidyr)
library(ggplot2)
base_vars <- c(
"ascensor_normalizado", "garaje_propio_normalizado",
"total_transporte_publico", "total_centros_salud_distrito",
"total_parques_jardines_distrito", "criminalidad_distrito",
"bedrooms", "bathrooms", "accommodates", "beds",
"Las_Ramblas", "Sagrada_Familia", "Barceloneta", "Casa_Batllo",
"room_type_normalizado", "review_scores_rating"
)
score_vars <- c(
"review_scores_accuracy",
"review_scores_checkin",
"review_scores_cleanliness",
"review_scores_location",
"review_scores_communication",
"review_scores_value"
)
vars_all <- c(base_vars, score_vars, "price")
missing <- setdiff(vars_all, names(data_clean))
if(length(missing) > 0) {
stop("Faltan estas columnas en data_clean: ", paste(missing, collapse = ", "))
} else {
message("Todas las variables encontradas correctamente.")
}
## Todas las variables encontradas correctamente.
data_pls <- data_clean %>%
select(all_of(base_vars), all_of(score_vars), price) %>%
filter(if_all(everything(), ~ !is.na(.))) %>%
fastDummies::dummy_cols(
select_columns = score_vars,
remove_selected_columns = TRUE,
remove_first_dummy = TRUE
)
glimpse(data_pls)
## Rows: 16,438
## Columns: 64
## $ ascensor_normalizado <dbl> 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,…
## $ garaje_propio_normalizado <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ total_transporte_publico <int> 426, 525, 426, 227, 420, 420, 227, 227…
## $ total_centros_salud_distrito <int> 252, 535, 252, 193, 188, 188, 193, 193…
## $ total_parques_jardines_distrito <int> 54, 116, 54, 23, 45, 45, 23, 23, 43, 2…
## $ criminalidad_distrito <int> 25408, 46754, 25408, 8588, 10057, 1005…
## $ bedrooms <dbl> 2, 3, 1, 1, 1, 1, 1, 3, 1, 2, 1, 4, 1,…
## $ bathrooms <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.5…
## $ accommodates <int> 6, 8, 2, 2, 2, 3, 4, 5, 1, 6, 2, 8, 2,…
## $ beds <dbl> 4, 6, 1, 1, 2, 2, 1, 3, 1, 7, 1, 6, 1,…
## $ Las_Ramblas <dbl> 2.9032765, 2.0839053, 3.2506960, 2.083…
## $ Sagrada_Familia <dbl> 1.0979743, 0.1151974, 2.0057229, 1.513…
## $ Barceloneta <dbl> 3.2360650, 3.1520189, 2.8247701, 3.885…
## $ Casa_Batllo <dbl> 2.5701261, 1.5422143, 3.1794564, 1.287…
## $ room_type_normalizado <dbl> 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,…
## $ review_scores_rating <dbl> 80, 87, 90, 95, 95, 87, 92, 88, 99, 87…
## $ price <dbl> 130, 60, 33, 45, 42, 53, 75, 85, 30, 1…
## $ review_scores_accuracy_3 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_accuracy_4 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_accuracy_5 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_accuracy_6 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_accuracy_7 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_accuracy_8 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ review_scores_accuracy_9 <int> 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0,…
## $ review_scores_accuracy_10 <int> 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,…
## $ review_scores_checkin_3 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_checkin_4 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_checkin_5 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_checkin_6 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_checkin_7 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ review_scores_checkin_8 <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ review_scores_checkin_9 <int> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,…
## $ review_scores_checkin_10 <int> 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1,…
## $ review_scores_cleanliness_3 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_cleanliness_4 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_cleanliness_5 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_cleanliness_6 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_cleanliness_7 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_cleanliness_8 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ review_scores_cleanliness_9 <int> 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1,…
## $ review_scores_cleanliness_10 <int> 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ review_scores_location_4 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_location_5 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_location_6 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_location_7 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_location_8 <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,…
## $ review_scores_location_9 <int> 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,…
## $ review_scores_location_10 <int> 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,…
## $ review_scores_communication_3 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_communication_4 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_communication_5 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_communication_6 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_communication_7 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_communication_8 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_communication_9 <int> 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,…
## $ review_scores_communication_10 <int> 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,…
## $ review_scores_value_3 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_value_4 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_value_5 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_value_6 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_value_7 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ review_scores_value_8 <int> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ review_scores_value_9 <int> 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,…
## $ review_scores_value_10 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
modelo_pls <- plsr(
price ~ .,
data = data_pls,
ncomp = 10,
scale = FALSE,
validation = "CV"
)
summary(modelo_pls)
## Data: X dimension: 16438 63
## Y dimension: 16438 1
## Fit method: kernelpls
## Number of components considered: 10
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 78.11 77.63 76.77 76.59 68.53 60.92 60.89
## adjCV 78.11 77.63 76.77 76.59 68.53 60.92 60.89
## 7 comps 8 comps 9 comps 10 comps
## CV 60.32 59.76 59.44 59.36
## adjCV 60.32 59.75 59.44 59.35
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 99.981 99.999 100.000 100.00 100.00 100.0 100.00 100.00
## price 1.251 3.434 3.901 23.09 39.24 39.3 40.48 41.66
## 9 comps 10 comps
## X 100.0 100.00
## price 42.3 42.49
plot(
RMSEP(modelo_pls),
main = "RMSEP vs Número de Componentes",
xlab = "Componentes",
ylab = "RMSEP"
)
opt_comp <- which.min( RMSEP(modelo_pls)$val[1, , -1] )
message("Número óptimo de componentes: ", opt_comp)
## Número óptimo de componentes: 10
final_pls <- plsr(
price ~ .,
data = data_pls,
ncomp = opt_comp,
scale = FALSE
)
load_mat <- loadings(final_pls)[, 1:opt_comp]
load_df <- as.data.frame(load_mat)
load_df$variable <- rownames(load_df)
colnames(load_df)[1:opt_comp] <- paste0("Comp", 1:opt_comp)
print(load_df)
## Comp1 Comp2 Comp3
## ascensor_normalizado 1.560353e-06 8.354808e-04 3.897536e-04
## garaje_propio_normalizado -1.730938e-07 6.586746e-05 1.338389e-04
## total_transporte_publico 1.248327e-03 6.596996e-01 -9.662894e-01
## total_centros_salud_distrito 5.379132e-03 7.386328e-01 7.321230e-01
## total_parques_jardines_distrito 1.236274e-03 1.718554e-01 -4.382512e-02
## criminalidad_distrito 1.000139e+00 -2.259411e-02 -2.678291e-03
## bedrooms 2.281856e-06 9.042129e-04 1.270438e-03
## bathrooms 2.603548e-06 3.683641e-04 1.065041e-03
## accommodates 1.135469e-05 1.714371e-03 3.361622e-03
## beds 6.545166e-06 1.422758e-03 2.416561e-03
## Las_Ramblas -5.625520e-05 1.257642e-03 -3.890812e-03
## Sagrada_Familia -1.707437e-05 -1.074472e-03 -1.214853e-02
## Barceloneta -5.770674e-05 2.804621e-03 6.338692e-04
## Casa_Batllo -3.978649e-05 -3.688362e-04 -8.640159e-03
## room_type_normalizado 2.134349e-06 2.074873e-04 6.659868e-04
## review_scores_rating -1.987404e-05 9.877541e-04 -4.781540e-03
## review_scores_accuracy_3 2.921482e-09 -4.677445e-07 4.367671e-07
## review_scores_accuracy_4 -2.359549e-08 -2.598421e-06 6.493992e-06
## review_scores_accuracy_5 -1.153009e-08 -1.178468e-06 5.534244e-06
## review_scores_accuracy_6 -1.674560e-08 -9.711709e-06 1.120584e-05
## review_scores_accuracy_7 3.224364e-08 5.784325e-06 2.693932e-05
## review_scores_accuracy_8 3.170726e-07 -5.797862e-06 4.697347e-05
## review_scores_accuracy_9 9.132838e-07 4.565136e-05 -3.426786e-06
## review_scores_accuracy_10 -1.222655e-06 -2.464654e-05 -1.105371e-04
## review_scores_checkin_3 -3.979098e-09 -6.104251e-07 2.373085e-06
## review_scores_checkin_4 1.237793e-09 1.081214e-06 4.361687e-06
## review_scores_checkin_5 -5.281331e-09 -2.408004e-06 -8.220990e-07
## review_scores_checkin_6 -8.557346e-08 -2.846647e-06 3.733353e-05
## review_scores_checkin_7 4.798515e-08 1.350947e-06 3.214309e-05
## review_scores_checkin_8 3.157017e-07 1.289547e-05 8.845947e-05
## review_scores_checkin_9 8.415734e-07 4.761228e-05 3.833516e-05
## review_scores_checkin_10 -1.157437e-06 -5.097671e-05 -2.095661e-04
## review_scores_cleanliness_3 -9.154165e-09 1.049824e-06 2.193471e-06
## review_scores_cleanliness_4 -2.660892e-08 -1.710930e-06 7.444155e-06
## review_scores_cleanliness_5 1.310885e-08 -6.256375e-06 4.777790e-06
## review_scores_cleanliness_6 7.672058e-08 -1.278999e-05 2.987177e-05
## review_scores_cleanliness_7 1.948007e-07 -1.651174e-05 1.779063e-05
## review_scores_cleanliness_8 2.987773e-07 -3.704623e-05 -1.569137e-05
## review_scores_cleanliness_9 4.035977e-06 -4.606478e-04 1.339296e-04
## review_scores_cleanliness_10 -4.595667e-06 5.403128e-04 -2.033131e-04
## review_scores_location_4 -1.719633e-08 -2.164880e-06 -2.682212e-06
## review_scores_location_5 -4.117703e-09 -1.765584e-06 -3.912920e-06
## review_scores_location_6 -4.378062e-08 -7.001831e-06 1.451903e-05
## review_scores_location_7 -1.084348e-07 -4.874084e-06 -1.609108e-05
## review_scores_location_8 -8.590953e-07 -1.005680e-05 -6.701355e-05
## review_scores_location_9 -5.407016e-06 1.394072e-04 -7.018107e-04
## review_scores_location_10 6.425267e-06 -1.102123e-04 7.791535e-04
## review_scores_communication_3 1.156754e-10 2.127207e-07 1.870519e-06
## review_scores_communication_4 -1.225712e-08 1.143690e-06 1.230190e-05
## review_scores_communication_5 1.019107e-08 -2.360995e-06 6.607101e-06
## review_scores_communication_6 4.082097e-09 -4.589740e-06 9.788097e-06
## review_scores_communication_7 1.042567e-10 -4.637436e-07 2.156680e-05
## review_scores_communication_8 1.122501e-07 3.148904e-06 4.765101e-05
## review_scores_communication_9 5.619518e-07 1.037834e-04 4.830333e-05
## review_scores_communication_10 -6.909959e-07 -9.503904e-05 -1.532322e-04
## review_scores_value_3 -9.049896e-09 -1.611428e-07 1.517048e-06
## review_scores_value_4 -5.225803e-08 -1.454094e-06 1.520805e-05
## review_scores_value_5 5.616955e-09 -2.149095e-06 6.581978e-06
## review_scores_value_6 -2.373430e-08 -1.001019e-05 2.209503e-05
## review_scores_value_7 8.844671e-08 5.759270e-07 5.502726e-05
## review_scores_value_8 2.331177e-07 -3.048395e-06 6.969706e-05
## review_scores_value_9 7.664906e-07 -2.642469e-06 4.665849e-05
## review_scores_value_10 -1.025981e-06 2.557824e-05 -2.377811e-04
## Comp4 Comp5 Comp6
## ascensor_normalizado -1.010491e-03 5.231180e-03 -1.280300e-03
## garaje_propio_normalizado 9.489046e-04 3.903348e-03 -2.704180e-04
## total_transporte_publico 4.106141e-01 4.109327e-03 1.862510e-01
## total_centros_salud_distrito -3.993122e-01 6.138513e-03 1.603691e-01
## total_parques_jardines_distrito 1.420776e-01 -5.185748e-02 -1.403107e+00
## criminalidad_distrito 1.425182e-03 3.821448e-05 6.346104e-04
## bedrooms 1.222132e-01 9.579992e-02 -2.805529e-03
## bathrooms 3.407707e-02 2.938174e-02 1.601020e-03
## accommodates 3.167334e-01 2.291209e-01 5.805659e-03
## beds 2.510830e-01 1.862128e-01 -4.901968e-03
## Las_Ramblas -1.911569e-02 -1.287814e-02 4.146523e-03
## Sagrada_Familia -2.596534e-03 -1.523609e-02 -3.633562e-02
## Barceloneta -1.357167e-02 -1.581054e-02 -2.770275e-02
## Casa_Batllo -1.496713e-02 -1.339589e-02 2.405149e-03
## room_type_normalizado 5.266146e-02 3.576684e-02 3.591798e-03
## review_scores_rating -1.305332e+00 9.480741e-01 -1.415681e-01
## review_scores_accuracy_3 6.199918e-05 -5.376873e-05 1.043498e-05
## review_scores_accuracy_4 1.305777e-03 -1.125344e-03 1.507133e-04
## review_scores_accuracy_5 1.061315e-03 -8.921948e-04 3.048201e-04
## review_scores_accuracy_6 3.590673e-03 -2.872809e-03 5.048981e-04
## review_scores_accuracy_7 4.492725e-03 -2.940835e-03 6.636581e-04
## review_scores_accuracy_8 1.098929e-02 -6.923602e-03 6.089411e-04
## review_scores_accuracy_9 1.880949e-02 -5.400645e-03 1.215805e-03
## review_scores_accuracy_10 -4.341962e-02 2.313472e-02 -3.754916e-03
## review_scores_checkin_3 8.818170e-05 -8.770234e-05 -3.139821e-05
## review_scores_checkin_4 6.734986e-04 -6.116379e-04 1.140593e-04
## review_scores_checkin_5 7.434841e-04 -5.659750e-04 2.225009e-04
## review_scores_checkin_6 2.403621e-03 -1.763136e-03 2.770082e-04
## review_scores_checkin_7 2.900176e-03 -1.717096e-03 5.093685e-04
## review_scores_checkin_8 9.338518e-03 -4.817365e-03 1.014009e-03
## review_scores_checkin_9 1.818994e-02 -5.068317e-03 2.003300e-03
## review_scores_checkin_10 -3.696206e-02 1.694406e-02 -4.481687e-03
## review_scores_cleanliness_3 1.399854e-04 -1.460928e-04 9.446129e-05
## review_scores_cleanliness_4 1.844766e-03 -1.654912e-03 4.446078e-04
## review_scores_cleanliness_5 1.029877e-03 -9.359810e-04 1.360675e-04
## review_scores_cleanliness_6 3.482370e-03 -3.037131e-03 5.185386e-04
## review_scores_cleanliness_7 3.949578e-03 -3.118926e-03 3.898258e-04
## review_scores_cleanliness_8 1.158601e-02 -9.256296e-03 1.018430e-03
## review_scores_cleanliness_9 1.159923e-02 2.070157e-03 4.259279e-03
## review_scores_cleanliness_10 -3.651158e-02 1.892908e-02 -7.002346e-03
## review_scores_location_4 6.600369e-04 -6.002772e-04 1.146184e-04
## review_scores_location_5 3.703927e-04 -2.735391e-04 8.281091e-05
## review_scores_location_6 1.977541e-03 -1.811773e-03 1.903132e-04
## review_scores_location_7 1.536108e-03 -1.086940e-03 2.691245e-04
## review_scores_location_8 6.087717e-03 -4.653133e-03 1.267404e-03
## review_scores_location_9 1.320598e-02 -4.577511e-03 1.751944e-03
## review_scores_location_10 -2.530197e-02 1.428575e-02 -3.836750e-03
## review_scores_communication_3 1.559925e-04 -1.610826e-04 5.781088e-05
## review_scores_communication_4 1.195247e-03 -1.019802e-03 3.449526e-04
## review_scores_communication_5 7.989895e-04 -4.967535e-04 9.028438e-05
## review_scores_communication_6 2.419063e-03 -2.035026e-03 2.394990e-04
## review_scores_communication_7 2.152112e-03 -1.651698e-03 4.921748e-04
## review_scores_communication_8 8.326623e-03 -5.401499e-03 8.804750e-04
## review_scores_communication_9 1.933055e-02 -6.974225e-03 2.015777e-03
## review_scores_communication_10 -3.693556e-02 1.990429e-02 -4.403871e-03
## review_scores_value_3 2.427226e-04 -2.235250e-04 1.235418e-04
## review_scores_value_4 1.693939e-03 -1.511706e-03 2.862913e-04
## review_scores_value_5 1.195109e-03 -1.070555e-03 2.497196e-04
## review_scores_value_6 4.266185e-03 -3.440723e-03 7.254224e-04
## review_scores_value_7 6.156756e-03 -3.911903e-03 8.316791e-04
## review_scores_value_8 1.806407e-02 -8.879299e-03 1.681792e-03
## review_scores_value_9 -2.810491e-03 1.062676e-02 7.740030e-05
## review_scores_value_10 -3.247264e-02 1.171374e-02 -4.274988e-03
## Comp7 Comp8 Comp9
## ascensor_normalizado 2.756177e-02 1.280476e-01 2.599265e-01
## garaje_propio_normalizado -9.485534e-03 -1.153762e-03 -9.030801e-03
## total_transporte_publico -8.876027e-02 -5.641602e-03 7.215668e-04
## total_centros_salud_distrito -8.515219e-02 7.637255e-03 -6.964378e-03
## total_parques_jardines_distrito 7.162696e-01 -1.814222e-02 2.273777e-02
## criminalidad_distrito -3.918595e-04 6.109828e-05 3.361498e-06
## bedrooms -7.259966e-02 -1.537928e-02 7.162530e-02
## bathrooms 1.734819e-02 9.411631e-02 2.704015e-01
## accommodates 5.274357e-02 3.506668e-01 -3.069212e-01
## beds -2.531682e-01 -3.708858e-01 2.531306e-01
## Las_Ramblas -5.370102e-01 4.057452e-01 2.837817e-01
## Sagrada_Familia -2.115690e-01 5.939152e-01 -7.472923e-01
## Barceloneta -4.067225e-01 4.128159e-01 -2.809214e-01
## Casa_Batllo -4.834531e-01 4.240515e-01 1.445728e-01
## room_type_normalizado 8.317401e-02 2.027800e-01 2.444512e-01
## review_scores_rating 5.714640e-02 1.381732e-02 -2.649987e-02
## review_scores_accuracy_3 -9.669303e-06 1.845766e-05 5.951284e-05
## review_scores_accuracy_4 -3.578824e-04 -2.206516e-04 2.935232e-03
## review_scores_accuracy_5 -3.782090e-06 -8.954765e-04 1.914540e-03
## review_scores_accuracy_6 6.495732e-05 -1.957544e-03 6.950223e-03
## review_scores_accuracy_7 -8.972080e-04 3.996593e-04 4.893905e-03
## review_scores_accuracy_8 1.978603e-03 1.604582e-02 1.310946e-02
## review_scores_accuracy_9 1.063342e-02 1.407186e-02 -2.474423e-01
## review_scores_accuracy_10 -1.050112e-02 -2.531508e-02 2.115587e-01
## review_scores_checkin_3 -2.346726e-05 1.593520e-05 3.148835e-06
## review_scores_checkin_4 -1.829573e-04 4.136479e-04 1.348912e-03
## review_scores_checkin_5 -1.575448e-04 -3.247252e-04 1.671375e-03
## review_scores_checkin_6 8.051739e-04 3.031937e-04 5.930493e-03
## review_scores_checkin_7 1.165458e-03 1.948910e-03 2.407849e-03
## review_scores_checkin_8 5.854184e-03 2.517190e-02 4.017052e-02
## review_scores_checkin_9 3.176933e-02 7.968878e-02 -7.476121e-02
## review_scores_checkin_10 -3.873681e-02 -1.049673e-01 1.839889e-02
## review_scores_cleanliness_3 7.946902e-05 -1.915046e-04 1.088969e-04
## review_scores_cleanliness_4 -2.309581e-04 -2.228202e-03 3.885680e-03
## review_scores_cleanliness_5 -3.741056e-04 -5.422664e-04 1.715653e-03
## review_scores_cleanliness_6 -9.222822e-04 -7.421857e-04 4.933456e-03
## review_scores_cleanliness_7 -8.527692e-04 -1.665469e-03 -1.055730e-03
## review_scores_cleanliness_8 -4.497815e-03 -3.899911e-03 -6.592953e-02
## review_scores_cleanliness_9 8.171044e-03 -1.006380e-02 -2.216310e-01
## review_scores_cleanliness_10 -4.283227e-04 2.152284e-02 2.714353e-01
## review_scores_location_4 -5.606431e-04 1.151681e-05 1.475539e-03
## review_scores_location_5 -1.245871e-04 3.112013e-04 9.354337e-04
## review_scores_location_6 -1.266500e-03 3.376880e-04 5.313929e-03
## review_scores_location_7 -3.315714e-03 2.422469e-03 1.053345e-03
## review_scores_location_8 -1.038863e-02 1.720689e-02 1.707044e-02
## review_scores_location_9 -4.460955e-02 1.090676e-01 -4.044031e-02
## review_scores_location_10 6.054577e-02 -1.281872e-01 1.227010e-02
## review_scores_communication_3 -2.964186e-05 -2.356922e-04 5.714437e-05
## review_scores_communication_4 -2.903289e-04 -3.932843e-05 1.964334e-03
## review_scores_communication_5 -5.547502e-05 -6.177366e-04 1.457820e-03
## review_scores_communication_6 5.045290e-04 -1.044803e-04 7.523259e-03
## review_scores_communication_7 6.202028e-04 1.046934e-03 4.026974e-03
## review_scores_communication_8 2.498945e-03 1.513061e-02 1.337665e-02
## review_scores_communication_9 1.766327e-02 6.580225e-02 -1.260525e-01
## review_scores_communication_10 -2.032710e-02 -7.855409e-02 9.338033e-02
## review_scores_value_3 1.009869e-04 3.439611e-05 4.712926e-04
## review_scores_value_4 -4.175898e-04 -1.401642e-03 2.662723e-03
## review_scores_value_5 1.600261e-04 -3.070991e-04 2.391266e-03
## review_scores_value_6 -6.212819e-04 -1.071948e-03 7.471682e-03
## review_scores_value_7 -2.953556e-04 2.665613e-03 4.215805e-03
## review_scores_value_8 1.318618e-02 4.585982e-02 3.811315e-03
## review_scores_value_9 6.588961e-03 -2.311134e-03 -6.956296e-02
## review_scores_value_10 -1.757020e-02 -4.083800e-02 4.091669e-02
## Comp10 variable
## ascensor_normalizado -3.004684e-02 ascensor_normalizado
## garaje_propio_normalizado -4.128112e-02 garaje_propio_normalizado
## total_transporte_publico 1.020255e-03 total_transporte_publico
## total_centros_salud_distrito 5.335345e-03 total_centros_salud_distrito
## total_parques_jardines_distrito -2.318524e-02 total_parques_jardines_distrito
## criminalidad_distrito -1.069924e-05 criminalidad_distrito
## bedrooms -1.748764e-02 bedrooms
## bathrooms 1.747752e-01 bathrooms
## accommodates -3.788100e-01 accommodates
## beds 4.077904e-01 beds
## Las_Ramblas -3.747952e-01 Las_Ramblas
## Sagrada_Familia 6.537305e-01 Sagrada_Familia
## Barceloneta 2.195842e-01 Barceloneta
## Casa_Batllo -2.838714e-01 Casa_Batllo
## room_type_normalizado 1.106933e-01 room_type_normalizado
## review_scores_rating 2.063847e-02 review_scores_rating
## review_scores_accuracy_3 -7.206050e-05 review_scores_accuracy_3
## review_scores_accuracy_4 -1.308850e-03 review_scores_accuracy_4
## review_scores_accuracy_5 -1.310094e-03 review_scores_accuracy_5
## review_scores_accuracy_6 -3.716728e-03 review_scores_accuracy_6
## review_scores_accuracy_7 -7.847232e-04 review_scores_accuracy_7
## review_scores_accuracy_8 1.268857e-02 review_scores_accuracy_8
## review_scores_accuracy_9 1.462525e-01 review_scores_accuracy_9
## review_scores_accuracy_10 -1.457333e-01 review_scores_accuracy_10
## review_scores_checkin_3 -1.700640e-04 review_scores_checkin_3
## review_scores_checkin_4 -1.176509e-03 review_scores_checkin_4
## review_scores_checkin_5 -1.402292e-03 review_scores_checkin_5
## review_scores_checkin_6 -9.604273e-04 review_scores_checkin_6
## review_scores_checkin_7 -9.386506e-04 review_scores_checkin_7
## review_scores_checkin_8 3.611715e-02 review_scores_checkin_8
## review_scores_checkin_9 8.555275e-02 review_scores_checkin_9
## review_scores_checkin_10 -1.112515e-01 review_scores_checkin_10
## review_scores_cleanliness_3 -6.337693e-05 review_scores_cleanliness_3
## review_scores_cleanliness_4 -3.707600e-03 review_scores_cleanliness_4
## review_scores_cleanliness_5 -2.333350e-03 review_scores_cleanliness_5
## review_scores_cleanliness_6 -3.674515e-03 review_scores_cleanliness_6
## review_scores_cleanliness_7 -4.235576e-03 review_scores_cleanliness_7
## review_scores_cleanliness_8 -3.452525e-02 review_scores_cleanliness_8
## review_scores_cleanliness_9 2.377789e-01 review_scores_cleanliness_9
## review_scores_cleanliness_10 -1.850428e-01 review_scores_cleanliness_10
## review_scores_location_4 -8.997494e-04 review_scores_location_4
## review_scores_location_5 -7.548728e-04 review_scores_location_5
## review_scores_location_6 -4.204794e-03 review_scores_location_6
## review_scores_location_7 -7.686865e-04 review_scores_location_7
## review_scores_location_8 3.931926e-03 review_scores_location_8
## review_scores_location_9 3.137151e-02 review_scores_location_9
## review_scores_location_10 -2.558760e-02 review_scores_location_10
## review_scores_communication_3 -1.343624e-04 review_scores_communication_3
## review_scores_communication_4 -1.609227e-03 review_scores_communication_4
## review_scores_communication_5 -1.317382e-03 review_scores_communication_5
## review_scores_communication_6 -2.548217e-03 review_scores_communication_6
## review_scores_communication_7 -1.371914e-03 review_scores_communication_7
## review_scores_communication_8 3.431652e-03 review_scores_communication_8
## review_scores_communication_9 6.723359e-02 review_scores_communication_9
## review_scores_communication_10 -5.863083e-02 review_scores_communication_10
## review_scores_value_3 -3.116021e-04 review_scores_value_3
## review_scores_value_4 -2.813030e-03 review_scores_value_4
## review_scores_value_5 -1.416238e-03 review_scores_value_5
## review_scores_value_6 -5.105318e-03 review_scores_value_6
## review_scores_value_7 9.615674e-05 review_scores_value_7
## review_scores_value_8 3.934613e-02 review_scores_value_8
## review_scores_value_9 9.771013e-02 review_scores_value_9
## review_scores_value_10 -1.208991e-01 review_scores_value_10
library(dplyr)
library(tidyr)
library(ggplot2)
load_long8 <- load_df %>%
pivot_longer(
cols = starts_with("Comp"),
names_to = "Component",
values_to = "Loading"
) %>%
filter(Component %in% paste0("Comp", 1:8))
top_vars <- load_long8 %>%
group_by(variable) %>%
summarize(total = sum(abs(Loading))) %>%
slice_max(order_by = total, n = 30) %>%
pull(variable)
heat_df <- load_long8 %>%
filter(variable %in% top_vars)
ggplot(heat_df, aes(x = Component, y = variable, fill = Loading)) +
geom_tile() +
scale_fill_gradient2(
low = "steelblue",
mid = "white",
high = "firebrick",
midpoint = 0
) +
labs(
title = "Heatmap de cargas PLS (Comp1–Comp8)",
x = "Componente",
y = NULL,
fill = "Loading"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 7)
)
Aqui observamos algo interesante , observamos que en la gran mayoria de los casos los precios caros se encuentran en puntuaciones medias- altas. No obstante , la muestra esta tan extremadamente sesgada que no podemos afirmar con seguridad ni demostrar que esto seria asi con una muestra mas homogenea.
Añadimos las variables al modelo del objetivo 2 añadiendo las variables implementadas en el objetivo 3 y comparamos la variabilidad de precio explicada entre los dos modelos para ver si estas nuevas variables aportan algo o no .
modelo_log3 <- lm(log(price) ~
ascensor + tiene_parking + adicionales + Las_Ramblas +
n_amenities + bedrooms + bathrooms + accommodates + room_type +
reviews_per_month + review_scores_rating + review_scores_accuracy +
review_scores_checkin + review_scores_communication +
review_scores_location + review_scores_value +
total_transporte_publico + total_centros_salud_distrito +
total_parques_jardines_distrito + criminalidad_distrito + beds,
data = data_clean)
summary(modelo_log3)
##
## Call:
## lm(formula = log(price) ~ ascensor + tiene_parking + adicionales +
## Las_Ramblas + n_amenities + bedrooms + bathrooms + accommodates +
## room_type + reviews_per_month + review_scores_rating + review_scores_accuracy +
## review_scores_checkin + review_scores_communication + review_scores_location +
## review_scores_value + total_transporte_publico + total_centros_salud_distrito +
## total_parques_jardines_distrito + criminalidad_distrito +
## beds, data = data_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.04752 -0.32575 -0.02015 0.30014 2.68081
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.892e+00 7.153e-02 54.402 < 2e-16 ***
## ascensorTRUE 6.423e-02 8.708e-03 7.376 1.71e-13 ***
## tiene_parkingTRUE 1.037e-02 1.291e-02 0.803 0.421767
## adicionalesTRUE 9.797e-02 2.652e-02 3.695 0.000221 ***
## Las_Ramblas -7.663e-02 4.769e-03 -16.070 < 2e-16 ***
## n_amenities 1.198e-03 3.920e-04 3.057 0.002243 **
## bedrooms -2.654e-03 7.719e-03 -0.344 0.730951
## bathrooms 4.248e-02 8.965e-03 4.738 2.18e-06 ***
## accommodates 1.553e-01 4.551e-03 34.122 < 2e-16 ***
## room_typePrivate room -4.548e-01 1.128e-02 -40.330 < 2e-16 ***
## reviews_per_month 5.322e-03 2.731e-03 1.948 0.051397 .
## review_scores_rating 6.206e-03 9.292e-04 6.679 2.48e-11 ***
## review_scores_accuracy 2.911e-06 7.621e-03 0.000 0.999695
## review_scores_checkin -5.386e-02 7.458e-03 -7.222 5.34e-13 ***
## review_scores_communication 1.859e-02 7.844e-03 2.370 0.017790 *
## review_scores_location -6.645e-03 7.325e-03 -0.907 0.364336
## review_scores_value -2.266e-02 7.365e-03 -3.077 0.002094 **
## total_transporte_publico 3.286e-04 8.814e-05 3.728 0.000194 ***
## total_centros_salud_distrito 5.015e-04 7.604e-05 6.596 4.35e-11 ***
## total_parques_jardines_distrito -2.594e-03 4.988e-04 -5.200 2.02e-07 ***
## criminalidad_distrito 1.469e-07 4.625e-07 0.317 0.750880
## beds -1.639e-02 4.995e-03 -3.280 0.001040 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4991 on 16416 degrees of freedom
## Multiple R-squared: 0.5296, Adjusted R-squared: 0.5289
## F-statistic: 879.9 on 21 and 16416 DF, p-value: < 2.2e-16
Como era de esperar , las variables no aportan nada , muy probablemente debido a lo extremadamente sesgada que esta la muestra.
summary(modelo_log2)$r.squared
## [1] 0.5243242
summary(modelo_log3)$r.squared
## [1] 0.5295506
pkgs <- c("caret", "FNN", "Metrics", "yardstick",
"dplyr", "ggplot2", "tibble", "glue")
to_install <- setdiff(pkgs, installed.packages()[,"Package"])
if (length(to_install) > 0) install.packages(to_install, dependencies = TRUE)
lapply(pkgs, library, character.only = TRUE)
## Warning: package 'yardstick' was built under R version 4.4.3
##
## Adjuntando el paquete: 'yardstick'
## The following objects are masked from 'package:Metrics':
##
## accuracy, mae, mape, mase, precision, recall, rmse, smape
## The following objects are masked from 'package:caret':
##
## precision, recall, sensitivity, specificity
## The following object is masked from 'package:readr':
##
## spec
## [[1]]
## [1] "broom" "tidygeocoder" "leaflet" "gridExtra" "factoextra"
## [6] "viridis" "viridisLite" "sf" "fastDummies" "geosphere"
## [11] "naniar" "dials" "scales" "parsnip" "tune"
## [16] "workflows" "recipes" "rsample" "pls" "future"
## [21] "Matrix" "xgboost" "randomForest" "Metrics" "FNN"
## [26] "caret" "lattice" "class" "knitr" "jsonlite"
## [31] "httr" "pander" "lubridate" "forcats" "stringr"
## [36] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [41] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [46] "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "broom" "tidygeocoder" "leaflet" "gridExtra" "factoextra"
## [6] "viridis" "viridisLite" "sf" "fastDummies" "geosphere"
## [11] "naniar" "dials" "scales" "parsnip" "tune"
## [16] "workflows" "recipes" "rsample" "pls" "future"
## [21] "Matrix" "xgboost" "randomForest" "Metrics" "FNN"
## [26] "caret" "lattice" "class" "knitr" "jsonlite"
## [31] "httr" "pander" "lubridate" "forcats" "stringr"
## [36] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [41] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [46] "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "broom" "tidygeocoder" "leaflet" "gridExtra" "factoextra"
## [6] "viridis" "viridisLite" "sf" "fastDummies" "geosphere"
## [11] "naniar" "dials" "scales" "parsnip" "tune"
## [16] "workflows" "recipes" "rsample" "pls" "future"
## [21] "Matrix" "xgboost" "randomForest" "Metrics" "FNN"
## [26] "caret" "lattice" "class" "knitr" "jsonlite"
## [31] "httr" "pander" "lubridate" "forcats" "stringr"
## [36] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [41] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [46] "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "yardstick" "broom" "tidygeocoder" "leaflet" "gridExtra"
## [6] "factoextra" "viridis" "viridisLite" "sf" "fastDummies"
## [11] "geosphere" "naniar" "dials" "scales" "parsnip"
## [16] "tune" "workflows" "recipes" "rsample" "pls"
## [21] "future" "Matrix" "xgboost" "randomForest" "Metrics"
## [26] "FNN" "caret" "lattice" "class" "knitr"
## [31] "jsonlite" "httr" "pander" "lubridate" "forcats"
## [36] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [41] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [46] "grDevices" "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "yardstick" "broom" "tidygeocoder" "leaflet" "gridExtra"
## [6] "factoextra" "viridis" "viridisLite" "sf" "fastDummies"
## [11] "geosphere" "naniar" "dials" "scales" "parsnip"
## [16] "tune" "workflows" "recipes" "rsample" "pls"
## [21] "future" "Matrix" "xgboost" "randomForest" "Metrics"
## [26] "FNN" "caret" "lattice" "class" "knitr"
## [31] "jsonlite" "httr" "pander" "lubridate" "forcats"
## [36] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [41] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [46] "grDevices" "utils" "datasets" "methods" "base"
##
## [[6]]
## [1] "yardstick" "broom" "tidygeocoder" "leaflet" "gridExtra"
## [6] "factoextra" "viridis" "viridisLite" "sf" "fastDummies"
## [11] "geosphere" "naniar" "dials" "scales" "parsnip"
## [16] "tune" "workflows" "recipes" "rsample" "pls"
## [21] "future" "Matrix" "xgboost" "randomForest" "Metrics"
## [26] "FNN" "caret" "lattice" "class" "knitr"
## [31] "jsonlite" "httr" "pander" "lubridate" "forcats"
## [36] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [41] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [46] "grDevices" "utils" "datasets" "methods" "base"
##
## [[7]]
## [1] "yardstick" "broom" "tidygeocoder" "leaflet" "gridExtra"
## [6] "factoextra" "viridis" "viridisLite" "sf" "fastDummies"
## [11] "geosphere" "naniar" "dials" "scales" "parsnip"
## [16] "tune" "workflows" "recipes" "rsample" "pls"
## [21] "future" "Matrix" "xgboost" "randomForest" "Metrics"
## [26] "FNN" "caret" "lattice" "class" "knitr"
## [31] "jsonlite" "httr" "pander" "lubridate" "forcats"
## [36] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [41] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [46] "grDevices" "utils" "datasets" "methods" "base"
##
## [[8]]
## [1] "glue" "yardstick" "broom" "tidygeocoder" "leaflet"
## [6] "gridExtra" "factoextra" "viridis" "viridisLite" "sf"
## [11] "fastDummies" "geosphere" "naniar" "dials" "scales"
## [16] "parsnip" "tune" "workflows" "recipes" "rsample"
## [21] "pls" "future" "Matrix" "xgboost" "randomForest"
## [26] "Metrics" "FNN" "caret" "lattice" "class"
## [31] "knitr" "jsonlite" "httr" "pander" "lubridate"
## [36] "forcats" "stringr" "dplyr" "purrr" "readr"
## [41] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [46] "graphics" "grDevices" "utils" "datasets" "methods"
## [51] "base"
Definimos las variables predictoras que se utilizarán para los modelos.
vars_pred <- c(
"accommodates","bedrooms","bathrooms","beds","n_amenities",
"room_type_normalizado","ascensor_normalizado","garaje_propio_normalizado",
"adicionales_normalizado","Las_Ramblas","total_transporte_publico",
"total_centros_salud_distrito","total_parques_jardines_distrito",
"criminalidad_distrito",
"availability_30","availability_60","availability_90","availability_365",
"minimum_nights","maximum_nights",
"host_listings_count"
)
Preparamos el conjunto de datos eliminando valores NA y escalando los predictores.
knn_df <- data_clean %>%
select(all_of(vars_pred), price) %>%
drop_na()
preproc <- preProcess(knn_df[, vars_pred], method = c("center", "scale"))
X_scaled <- predict(preproc, knn_df[, vars_pred])
y <- knn_df$price
set.seed(42)
train_id <- createDataPartition(y, p = 0.8, list = FALSE)
X_train <- X_scaled[train_id, ]
X_test <- X_scaled[-train_id, ]
y_train <- y[train_id]
y_test <- y[-train_id]
Realizamos una validación cruzada para encontrar el mejor valor de k.
k_vals <- 3:35
rmse_cv <- numeric(length(k_vals))
folds <- createFolds(y_train, k = 5, returnTrain = TRUE)
for (i in seq_along(k_vals)) {
k <- k_vals[i]
rmse_fold <- numeric(length(folds))
for (j in seq_along(folds)) {
id_tr <- folds[[j]]
id_te <- setdiff(seq_along(y_train), id_tr)
pred_cv <- knn.reg(
train = X_train[id_tr, ], test = X_train[id_te, ],
y = y_train[id_tr], k = k)$pred
rmse_fold[j] <- rmse_vec(y_train[id_te], pred_cv)
}
rmse_cv[i] <- mean(rmse_fold)
}
best_k <- k_vals[which.min(rmse_cv)]
print(glue("Mejor k según CV: {best_k} (RMSE {round(min(rmse_cv),2)} €)"))
## Mejor k según CV: 8 (RMSE 50.53 €)
ggplot(data.frame(k = k_vals, RMSE = rmse_cv),
aes(k, RMSE)) +
geom_line() + geom_point() +
geom_vline(xintercept = best_k, linetype = "dashed") +
labs(title = "Búsqueda del k óptimo (CV 5-fold)",
x = "Número de vecinos (k)", y = "RMSE (€)")
Evaluamos el modelo en el conjunto de test, en euros.
pred_eur <- knn.reg(
train = X_train, test = X_test, y = y_train, k = best_k)$pred
rmse_eur <- rmse_vec(y_test, pred_eur)
mae_eur <- mae_vec (y_test, pred_eur)
cat("RMSE:", round(rmse_eur,2),
"| MAE:", round(mae_eur,2), "€\n")
## RMSE: 49.6 | MAE: 28.85 €
Evaluamos el modelo con la transformación logarítmica y reconversión a euros.
y_train_log <- log10(y_train)
y_test_log <- log10(y_test)
pred_log <- knn.reg(
train = X_train, test = X_test,
y = y_train_log, k = best_k)$pred
rmse_log <- rmse_vec(y_test_log, pred_log)
mae_log <- mae_vec (y_test_log, pred_log)
pred_eur_from_log <- 10 ^ pred_log
rmse_eur_from_log <- rmse_vec(y_test, pred_eur_from_log)
mae_eur_from_log <- mae_vec (y_test, pred_eur_from_log)
mult <- 10 ^ rmse_log
cat(glue::glue(
" ↳ RMSE(€): {round(rmse_eur_from_log,2)} €\n",
" ↳ MAE(€): {round(mae_eur_from_log ,2)} €\n"
))
## ↳ RMSE(€): 50.42 €
## ↳ MAE(€): 27.91 €
Filtramos el conjunto para propiedades con precio menor o igual a 300 € y evaluamos.
sel <- knn_df$price <= 300
X_filt <- X_scaled[sel, ]
y_filt <- y[sel]
set.seed(42)
id_tr2 <- createDataPartition(y_filt, p = 0.8, list = FALSE)
X_tr2 <- X_filt[id_tr2, ]; X_te2 <- X_filt[-id_tr2, ]
y_tr2 <- y_filt[id_tr2]; y_te2 <- y_filt[-id_tr2]
pred_300 <- knn.reg(train = X_tr2, test = X_te2,
y = y_tr2, k = best_k)$pred
rmse_300 <- rmse_vec(y_te2, pred_300)
mae_300 <- mae_vec (y_te2, pred_300)
cat("RMSE:", round(rmse_300,2),
"| MAE:", round(mae_300,2), "€\n")
## RMSE: 37.27 | MAE: 25.13 €
Aplicamos log10 para el conjunto filtrado y reconvertimos a euros.
y_tr2_log <- log10(y_tr2)
y_te2_log <- log10(y_te2)
pred_log_300 <- knn.reg(
train = X_tr2, test = X_te2,
y = y_tr2_log, k = best_k)$pred
rmse_log_300 <- rmse_vec(y_te2_log, pred_log_300)
mae_log_300 <- mae_vec (y_te2_log, pred_log_300)
pred_eur_300 <- 10 ^ pred_log_300
rmse_eur_300 <- rmse_vec(y_te2, pred_eur_300)
mae_eur_300 <- mae_vec (y_te2, pred_eur_300)
mult_300 <- 10 ^ rmse_log_300
cat(glue::glue(
"RMSE(log): {round(rmse_log_300,3)} (×{round(mult_300,2)})\n",
"RMSE(€): {round(rmse_eur_300,2)} € | MAE(€): {round(mae_eur_300,2)} €\n"
))
## RMSE(log): 0.189 (×1.54)
## RMSE(€): 37.61 € | MAE(€): 24.38 €
Construimos la tabla resumen de resultados en diferentes escenarios.
results_knn <- tibble(
Escenario = c("€", "log→€", "€ ≤300", "log→€ ≤300"),
RMSE = c(rmse_eur, rmse_eur_from_log, rmse_300, rmse_eur_300),
MAE = c(mae_eur, mae_eur_from_log, mae_300, mae_eur_300),
Media_Precio = c(mean(y_test), mean(y_test), mean(y_te2), mean(y_te2))
) %>%
mutate(
MAPE = 100 * c(
mape_vec(y_test, pred_eur),
mape_vec(y_test, pred_eur_from_log),
mape_vec(y_te2, pred_300),
mape_vec(y_te2, pred_eur_300)
),
NRMSE = 100 * RMSE / Media_Precio,
NMAE = 100 * MAE / Media_Precio
) %>%
select(Escenario, RMSE, MAE, MAPE, NRMSE, NMAE)
print(results_knn)
## # A tibble: 4 × 6
## Escenario RMSE MAE MAPE NRMSE NMAE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 € 49.6 28.9 4158. 56.5 32.9
## 2 log→€ 50.4 27.9 3575. 57.4 31.8
## 3 € ≤300 37.3 25.1 3988. 47.4 31.9
## 4 log→€ ≤300 37.6 24.4 3537. 47.8 31.0
Visualizamos los resultados de RMSE absoluto y normalizado por escenario.
ggplot(results_knn, aes(Escenario, RMSE, fill = Escenario)) +
geom_col(width = 0.65, show.legend = FALSE) +
geom_text(aes(label = round(RMSE,2)), vjust = -0.3, size = 3.8) +
labs(title = "k-NN · RMSE absoluto (€)", y = "RMSE (€)", x = "") +
theme_minimal()
ggplot(results_knn, aes(Escenario, MAE, fill = Escenario)) +
geom_col(width = 0.65, show.legend = FALSE) +
geom_text(aes(label = round(MAE,2)), vjust = -0.3, size = 3.8) +
labs(title = "k-NN · MAE absoluto (€)", y = "MAE (€)", x = "") +
theme_minimal()
ggplot(results_knn, aes(Escenario, NRMSE, fill = Escenario)) +
geom_col(width = 0.65, show.legend = FALSE) +
geom_text(aes(label = paste0(round(NRMSE,1), "%")),
vjust = -0.3, size = 3.8) +
labs(title = "k-NN · RMSE normalizado (%)",
y = "RMSE / Precio medio (%)", x = "") +
theme_minimal()
ggplot(results_knn, aes(Escenario, NMAE, fill = Escenario)) +
geom_col(width = 0.65, show.legend = FALSE) +
geom_text(aes(label = paste0(round(NMAE,1), "%")),
vjust = -0.3, size = 3.8) +
labs(title = "k-NN · MAE normalizado (%)",
y = "MAE / Precio medio (%)", x = "") +
theme_minimal()
Creamos el conjunto de datos base y lo separamos en entrenamiento y test.
rf_df <- data_clean %>% select(all_of(vars_pred), price) %>% drop_na()
set.seed(42)
split_rf <- initial_split(rf_df, prop = 0.8, strata = price)
train_rf <- training(split_rf)
test_rf <- testing(split_rf)
Creamos la receta base aplicando log-transformaciones y normalización.
rec_rf <- recipe(price ~ ., data = train_rf) %>%
step_log(host_listings_count, availability_30, availability_60,
availability_90, availability_365, offset = 1) %>%
step_normalize(all_numeric_predictors()) %>%
step_zv(all_predictors())
Definimos el modelo Random Forest y una malla de hiperparámetros aleatorios.
rf_spec <- rand_forest(trees = 800, mtry = tune(), min_n = tune()) %>%
set_mode("regression") %>%
set_engine("ranger", importance = "permutation")
rf_grid <- grid_random(
mtry(range = c(3, 14)),
min_n(range = c(2, 20)),
size = 10
)
Entrenamos el modelo usando validación cruzada de 3 folds y seleccionamos los mejores hiperparámetros.
set.seed(42)
rf_tuned <- tune_grid(
workflow(rec_rf, rf_spec),
resamples = vfold_cv(train_rf, v = 3),
grid = rf_grid,
metrics = metric_set(yardstick::rmse),
control = control_grid(verbose = TRUE, allow_par = FALSE)
)
## Warning: package 'ranger' was built under R version 4.4.3
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/9
## ✓ Fold1: preprocessor 1/1, model 1/9
## i Fold1: preprocessor 1/1, model 1/9 (extracts)
## i Fold1: preprocessor 1/1, model 1/9 (predictions)
## i Fold1: preprocessor 1/1, model 2/9
## ✓ Fold1: preprocessor 1/1, model 2/9
## i Fold1: preprocessor 1/1, model 2/9 (extracts)
## i Fold1: preprocessor 1/1, model 2/9 (predictions)
## i Fold1: preprocessor 1/1, model 3/9
## ✓ Fold1: preprocessor 1/1, model 3/9
## i Fold1: preprocessor 1/1, model 3/9 (extracts)
## i Fold1: preprocessor 1/1, model 3/9 (predictions)
## i Fold1: preprocessor 1/1, model 4/9
## ✓ Fold1: preprocessor 1/1, model 4/9
## i Fold1: preprocessor 1/1, model 4/9 (extracts)
## i Fold1: preprocessor 1/1, model 4/9 (predictions)
## i Fold1: preprocessor 1/1, model 5/9
## ✓ Fold1: preprocessor 1/1, model 5/9
## i Fold1: preprocessor 1/1, model 5/9 (extracts)
## i Fold1: preprocessor 1/1, model 5/9 (predictions)
## i Fold1: preprocessor 1/1, model 6/9
## ✓ Fold1: preprocessor 1/1, model 6/9
## i Fold1: preprocessor 1/1, model 6/9 (extracts)
## i Fold1: preprocessor 1/1, model 6/9 (predictions)
## i Fold1: preprocessor 1/1, model 7/9
## ✓ Fold1: preprocessor 1/1, model 7/9
## i Fold1: preprocessor 1/1, model 7/9 (extracts)
## i Fold1: preprocessor 1/1, model 7/9 (predictions)
## i Fold1: preprocessor 1/1, model 8/9
## ✓ Fold1: preprocessor 1/1, model 8/9
## i Fold1: preprocessor 1/1, model 8/9 (extracts)
## i Fold1: preprocessor 1/1, model 8/9 (predictions)
## i Fold1: preprocessor 1/1, model 9/9
## ✓ Fold1: preprocessor 1/1, model 9/9
## i Fold1: preprocessor 1/1, model 9/9 (extracts)
## i Fold1: preprocessor 1/1, model 9/9 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/9
## ✓ Fold2: preprocessor 1/1, model 1/9
## i Fold2: preprocessor 1/1, model 1/9 (extracts)
## i Fold2: preprocessor 1/1, model 1/9 (predictions)
## i Fold2: preprocessor 1/1, model 2/9
## ✓ Fold2: preprocessor 1/1, model 2/9
## i Fold2: preprocessor 1/1, model 2/9 (extracts)
## i Fold2: preprocessor 1/1, model 2/9 (predictions)
## i Fold2: preprocessor 1/1, model 3/9
## ✓ Fold2: preprocessor 1/1, model 3/9
## i Fold2: preprocessor 1/1, model 3/9 (extracts)
## i Fold2: preprocessor 1/1, model 3/9 (predictions)
## i Fold2: preprocessor 1/1, model 4/9
## ✓ Fold2: preprocessor 1/1, model 4/9
## i Fold2: preprocessor 1/1, model 4/9 (extracts)
## i Fold2: preprocessor 1/1, model 4/9 (predictions)
## i Fold2: preprocessor 1/1, model 5/9
## ✓ Fold2: preprocessor 1/1, model 5/9
## i Fold2: preprocessor 1/1, model 5/9 (extracts)
## i Fold2: preprocessor 1/1, model 5/9 (predictions)
## i Fold2: preprocessor 1/1, model 6/9
## ✓ Fold2: preprocessor 1/1, model 6/9
## i Fold2: preprocessor 1/1, model 6/9 (extracts)
## i Fold2: preprocessor 1/1, model 6/9 (predictions)
## i Fold2: preprocessor 1/1, model 7/9
## ✓ Fold2: preprocessor 1/1, model 7/9
## i Fold2: preprocessor 1/1, model 7/9 (extracts)
## i Fold2: preprocessor 1/1, model 7/9 (predictions)
## i Fold2: preprocessor 1/1, model 8/9
## ✓ Fold2: preprocessor 1/1, model 8/9
## i Fold2: preprocessor 1/1, model 8/9 (extracts)
## i Fold2: preprocessor 1/1, model 8/9 (predictions)
## i Fold2: preprocessor 1/1, model 9/9
## ✓ Fold2: preprocessor 1/1, model 9/9
## i Fold2: preprocessor 1/1, model 9/9 (extracts)
## i Fold2: preprocessor 1/1, model 9/9 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/9
## ✓ Fold3: preprocessor 1/1, model 1/9
## i Fold3: preprocessor 1/1, model 1/9 (extracts)
## i Fold3: preprocessor 1/1, model 1/9 (predictions)
## i Fold3: preprocessor 1/1, model 2/9
## ✓ Fold3: preprocessor 1/1, model 2/9
## i Fold3: preprocessor 1/1, model 2/9 (extracts)
## i Fold3: preprocessor 1/1, model 2/9 (predictions)
## i Fold3: preprocessor 1/1, model 3/9
## ✓ Fold3: preprocessor 1/1, model 3/9
## i Fold3: preprocessor 1/1, model 3/9 (extracts)
## i Fold3: preprocessor 1/1, model 3/9 (predictions)
## i Fold3: preprocessor 1/1, model 4/9
## ✓ Fold3: preprocessor 1/1, model 4/9
## i Fold3: preprocessor 1/1, model 4/9 (extracts)
## i Fold3: preprocessor 1/1, model 4/9 (predictions)
## i Fold3: preprocessor 1/1, model 5/9
## ✓ Fold3: preprocessor 1/1, model 5/9
## i Fold3: preprocessor 1/1, model 5/9 (extracts)
## i Fold3: preprocessor 1/1, model 5/9 (predictions)
## i Fold3: preprocessor 1/1, model 6/9
## ✓ Fold3: preprocessor 1/1, model 6/9
## i Fold3: preprocessor 1/1, model 6/9 (extracts)
## i Fold3: preprocessor 1/1, model 6/9 (predictions)
## i Fold3: preprocessor 1/1, model 7/9
## ✓ Fold3: preprocessor 1/1, model 7/9
## i Fold3: preprocessor 1/1, model 7/9 (extracts)
## i Fold3: preprocessor 1/1, model 7/9 (predictions)
## i Fold3: preprocessor 1/1, model 8/9
## ✓ Fold3: preprocessor 1/1, model 8/9
## i Fold3: preprocessor 1/1, model 8/9 (extracts)
## i Fold3: preprocessor 1/1, model 8/9 (predictions)
## i Fold3: preprocessor 1/1, model 9/9
## ✓ Fold3: preprocessor 1/1, model 9/9
## i Fold3: preprocessor 1/1, model 9/9 (extracts)
## i Fold3: preprocessor 1/1, model 9/9 (predictions)
best_rf <- select_best(rf_tuned, metric = "rmse")
print(best_rf)
## # A tibble: 1 × 3
## mtry min_n .config
## <int> <int> <chr>
## 1 13 3 Preprocessor1_Model8
Ajustamos el modelo final con los mejores hiperparámetros y evaluamos en euros.
rf_final <- finalize_workflow(workflow(rec_rf, rf_spec), best_rf) %>%
fit(train_rf)
pred_eur <- predict(rf_final, test_rf)$.pred
rmse_rf_eur <- yardstick::rmse_vec(test_rf$price, pred_eur)
mae_rf_eur <- yardstick::mae_vec (test_rf$price, pred_eur)
cat("RMSE:", round(rmse_rf_eur,2),
"| MAE:", round(mae_rf_eur,2), "€\n")
## RMSE: 43.42 | MAE: 25.16 €
Creamos una versión logarítmica del modelo para reducir el impacto de outliers.
rec_rf_log <- recipe(price ~ ., data = train_rf) %>%
step_zv(all_predictors()) %>%
step_log(price, base = 10, skip = TRUE) %>%
step_log(host_listings_count, availability_30, availability_60,
availability_90, availability_365, offset = 1) %>%
step_normalize(all_numeric_predictors())
set.seed(42)
rf_tuned_log <- tune_grid(
workflow(rec_rf_log, rf_spec),
resamples = vfold_cv(train_rf, v = 3),
grid = rf_grid,
metrics = metric_set(yardstick::rmse),
control = control_grid(verbose = TRUE, allow_par = FALSE)
)
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/9
## ✓ Fold1: preprocessor 1/1, model 1/9
## i Fold1: preprocessor 1/1, model 1/9 (extracts)
## i Fold1: preprocessor 1/1, model 1/9 (predictions)
## i Fold1: preprocessor 1/1, model 2/9
## ✓ Fold1: preprocessor 1/1, model 2/9
## i Fold1: preprocessor 1/1, model 2/9 (extracts)
## i Fold1: preprocessor 1/1, model 2/9 (predictions)
## i Fold1: preprocessor 1/1, model 3/9
## ✓ Fold1: preprocessor 1/1, model 3/9
## i Fold1: preprocessor 1/1, model 3/9 (extracts)
## i Fold1: preprocessor 1/1, model 3/9 (predictions)
## i Fold1: preprocessor 1/1, model 4/9
## ✓ Fold1: preprocessor 1/1, model 4/9
## i Fold1: preprocessor 1/1, model 4/9 (extracts)
## i Fold1: preprocessor 1/1, model 4/9 (predictions)
## i Fold1: preprocessor 1/1, model 5/9
## ✓ Fold1: preprocessor 1/1, model 5/9
## i Fold1: preprocessor 1/1, model 5/9 (extracts)
## i Fold1: preprocessor 1/1, model 5/9 (predictions)
## i Fold1: preprocessor 1/1, model 6/9
## ✓ Fold1: preprocessor 1/1, model 6/9
## i Fold1: preprocessor 1/1, model 6/9 (extracts)
## i Fold1: preprocessor 1/1, model 6/9 (predictions)
## i Fold1: preprocessor 1/1, model 7/9
## ✓ Fold1: preprocessor 1/1, model 7/9
## i Fold1: preprocessor 1/1, model 7/9 (extracts)
## i Fold1: preprocessor 1/1, model 7/9 (predictions)
## i Fold1: preprocessor 1/1, model 8/9
## ✓ Fold1: preprocessor 1/1, model 8/9
## i Fold1: preprocessor 1/1, model 8/9 (extracts)
## i Fold1: preprocessor 1/1, model 8/9 (predictions)
## i Fold1: preprocessor 1/1, model 9/9
## ✓ Fold1: preprocessor 1/1, model 9/9
## i Fold1: preprocessor 1/1, model 9/9 (extracts)
## i Fold1: preprocessor 1/1, model 9/9 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/9
## ✓ Fold2: preprocessor 1/1, model 1/9
## i Fold2: preprocessor 1/1, model 1/9 (extracts)
## i Fold2: preprocessor 1/1, model 1/9 (predictions)
## i Fold2: preprocessor 1/1, model 2/9
## ✓ Fold2: preprocessor 1/1, model 2/9
## i Fold2: preprocessor 1/1, model 2/9 (extracts)
## i Fold2: preprocessor 1/1, model 2/9 (predictions)
## i Fold2: preprocessor 1/1, model 3/9
## ✓ Fold2: preprocessor 1/1, model 3/9
## i Fold2: preprocessor 1/1, model 3/9 (extracts)
## i Fold2: preprocessor 1/1, model 3/9 (predictions)
## i Fold2: preprocessor 1/1, model 4/9
## ✓ Fold2: preprocessor 1/1, model 4/9
## i Fold2: preprocessor 1/1, model 4/9 (extracts)
## i Fold2: preprocessor 1/1, model 4/9 (predictions)
## i Fold2: preprocessor 1/1, model 5/9
## ✓ Fold2: preprocessor 1/1, model 5/9
## i Fold2: preprocessor 1/1, model 5/9 (extracts)
## i Fold2: preprocessor 1/1, model 5/9 (predictions)
## i Fold2: preprocessor 1/1, model 6/9
## ✓ Fold2: preprocessor 1/1, model 6/9
## i Fold2: preprocessor 1/1, model 6/9 (extracts)
## i Fold2: preprocessor 1/1, model 6/9 (predictions)
## i Fold2: preprocessor 1/1, model 7/9
## ✓ Fold2: preprocessor 1/1, model 7/9
## i Fold2: preprocessor 1/1, model 7/9 (extracts)
## i Fold2: preprocessor 1/1, model 7/9 (predictions)
## i Fold2: preprocessor 1/1, model 8/9
## ✓ Fold2: preprocessor 1/1, model 8/9
## i Fold2: preprocessor 1/1, model 8/9 (extracts)
## i Fold2: preprocessor 1/1, model 8/9 (predictions)
## i Fold2: preprocessor 1/1, model 9/9
## ✓ Fold2: preprocessor 1/1, model 9/9
## i Fold2: preprocessor 1/1, model 9/9 (extracts)
## i Fold2: preprocessor 1/1, model 9/9 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/9
## ✓ Fold3: preprocessor 1/1, model 1/9
## i Fold3: preprocessor 1/1, model 1/9 (extracts)
## i Fold3: preprocessor 1/1, model 1/9 (predictions)
## i Fold3: preprocessor 1/1, model 2/9
## ✓ Fold3: preprocessor 1/1, model 2/9
## i Fold3: preprocessor 1/1, model 2/9 (extracts)
## i Fold3: preprocessor 1/1, model 2/9 (predictions)
## i Fold3: preprocessor 1/1, model 3/9
## ✓ Fold3: preprocessor 1/1, model 3/9
## i Fold3: preprocessor 1/1, model 3/9 (extracts)
## i Fold3: preprocessor 1/1, model 3/9 (predictions)
## i Fold3: preprocessor 1/1, model 4/9
## ✓ Fold3: preprocessor 1/1, model 4/9
## i Fold3: preprocessor 1/1, model 4/9 (extracts)
## i Fold3: preprocessor 1/1, model 4/9 (predictions)
## i Fold3: preprocessor 1/1, model 5/9
## ✓ Fold3: preprocessor 1/1, model 5/9
## i Fold3: preprocessor 1/1, model 5/9 (extracts)
## i Fold3: preprocessor 1/1, model 5/9 (predictions)
## i Fold3: preprocessor 1/1, model 6/9
## ✓ Fold3: preprocessor 1/1, model 6/9
## i Fold3: preprocessor 1/1, model 6/9 (extracts)
## i Fold3: preprocessor 1/1, model 6/9 (predictions)
## i Fold3: preprocessor 1/1, model 7/9
## ✓ Fold3: preprocessor 1/1, model 7/9
## i Fold3: preprocessor 1/1, model 7/9 (extracts)
## i Fold3: preprocessor 1/1, model 7/9 (predictions)
## i Fold3: preprocessor 1/1, model 8/9
## ✓ Fold3: preprocessor 1/1, model 8/9
## i Fold3: preprocessor 1/1, model 8/9 (extracts)
## i Fold3: preprocessor 1/1, model 8/9 (predictions)
## i Fold3: preprocessor 1/1, model 9/9
## ✓ Fold3: preprocessor 1/1, model 9/9
## i Fold3: preprocessor 1/1, model 9/9 (extracts)
## i Fold3: preprocessor 1/1, model 9/9 (predictions)
best_rf_log <- select_best(rf_tuned_log, metric = "rmse")
print(best_rf_log)
## # A tibble: 1 × 3
## mtry min_n .config
## <int> <int> <chr>
## 1 14 5 Preprocessor1_Model4
Ajustamos el modelo logarítmico y reconvertimos las predicciones a euros.
rf_final_log <- finalize_workflow(workflow(rec_rf_log, rf_spec), best_rf_log) %>%
fit(train_rf)
pred_log <- predict(rf_final_log, test_rf)$.pred
pred_eur_log <- 10 ^ pred_log
rmse_log_eur <- yardstick::rmse_vec(test_rf$price, pred_eur_log)
mae_log_eur <- yardstick::mae_vec (test_rf$price, pred_eur_log)
cat("RMSE log→€:", round(rmse_log_eur,2),
"| MAE:", round(mae_log_eur,2), "€\n")
## RMSE log→€: 44.15 | MAE: 23.86 €
Filtramos propiedades con precio ≤ 300 € y repetimos el ajuste.
rf_df300 <- rf_df %>% filter(price <= 300)
set.seed(42)
split_300 <- initial_split(rf_df300, prop = 0.8, strata = price)
train_300 <- training(split_300)
test_300 <- testing(split_300)
rec_300 <- recipe(price ~ ., data = train_300) %>%
step_zv(all_predictors()) %>%
step_log(host_listings_count, availability_30, availability_60,
availability_90, availability_365, offset = 1) %>%
step_normalize(all_numeric_predictors())
set.seed(42)
rf_tuned_300 <- tune_grid(
workflow(rec_300, rf_spec),
resamples = vfold_cv(train_300, v = 3),
grid = rf_grid,
metrics = metric_set(yardstick::rmse),
control = control_grid(verbose = TRUE, allow_par = FALSE)
)
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/9
## ✓ Fold1: preprocessor 1/1, model 1/9
## i Fold1: preprocessor 1/1, model 1/9 (extracts)
## i Fold1: preprocessor 1/1, model 1/9 (predictions)
## i Fold1: preprocessor 1/1, model 2/9
## ✓ Fold1: preprocessor 1/1, model 2/9
## i Fold1: preprocessor 1/1, model 2/9 (extracts)
## i Fold1: preprocessor 1/1, model 2/9 (predictions)
## i Fold1: preprocessor 1/1, model 3/9
## ✓ Fold1: preprocessor 1/1, model 3/9
## i Fold1: preprocessor 1/1, model 3/9 (extracts)
## i Fold1: preprocessor 1/1, model 3/9 (predictions)
## i Fold1: preprocessor 1/1, model 4/9
## ✓ Fold1: preprocessor 1/1, model 4/9
## i Fold1: preprocessor 1/1, model 4/9 (extracts)
## i Fold1: preprocessor 1/1, model 4/9 (predictions)
## i Fold1: preprocessor 1/1, model 5/9
## ✓ Fold1: preprocessor 1/1, model 5/9
## i Fold1: preprocessor 1/1, model 5/9 (extracts)
## i Fold1: preprocessor 1/1, model 5/9 (predictions)
## i Fold1: preprocessor 1/1, model 6/9
## ✓ Fold1: preprocessor 1/1, model 6/9
## i Fold1: preprocessor 1/1, model 6/9 (extracts)
## i Fold1: preprocessor 1/1, model 6/9 (predictions)
## i Fold1: preprocessor 1/1, model 7/9
## ✓ Fold1: preprocessor 1/1, model 7/9
## i Fold1: preprocessor 1/1, model 7/9 (extracts)
## i Fold1: preprocessor 1/1, model 7/9 (predictions)
## i Fold1: preprocessor 1/1, model 8/9
## ✓ Fold1: preprocessor 1/1, model 8/9
## i Fold1: preprocessor 1/1, model 8/9 (extracts)
## i Fold1: preprocessor 1/1, model 8/9 (predictions)
## i Fold1: preprocessor 1/1, model 9/9
## ✓ Fold1: preprocessor 1/1, model 9/9
## i Fold1: preprocessor 1/1, model 9/9 (extracts)
## i Fold1: preprocessor 1/1, model 9/9 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/9
## ✓ Fold2: preprocessor 1/1, model 1/9
## i Fold2: preprocessor 1/1, model 1/9 (extracts)
## i Fold2: preprocessor 1/1, model 1/9 (predictions)
## i Fold2: preprocessor 1/1, model 2/9
## ✓ Fold2: preprocessor 1/1, model 2/9
## i Fold2: preprocessor 1/1, model 2/9 (extracts)
## i Fold2: preprocessor 1/1, model 2/9 (predictions)
## i Fold2: preprocessor 1/1, model 3/9
## ✓ Fold2: preprocessor 1/1, model 3/9
## i Fold2: preprocessor 1/1, model 3/9 (extracts)
## i Fold2: preprocessor 1/1, model 3/9 (predictions)
## i Fold2: preprocessor 1/1, model 4/9
## ✓ Fold2: preprocessor 1/1, model 4/9
## i Fold2: preprocessor 1/1, model 4/9 (extracts)
## i Fold2: preprocessor 1/1, model 4/9 (predictions)
## i Fold2: preprocessor 1/1, model 5/9
## ✓ Fold2: preprocessor 1/1, model 5/9
## i Fold2: preprocessor 1/1, model 5/9 (extracts)
## i Fold2: preprocessor 1/1, model 5/9 (predictions)
## i Fold2: preprocessor 1/1, model 6/9
## ✓ Fold2: preprocessor 1/1, model 6/9
## i Fold2: preprocessor 1/1, model 6/9 (extracts)
## i Fold2: preprocessor 1/1, model 6/9 (predictions)
## i Fold2: preprocessor 1/1, model 7/9
## ✓ Fold2: preprocessor 1/1, model 7/9
## i Fold2: preprocessor 1/1, model 7/9 (extracts)
## i Fold2: preprocessor 1/1, model 7/9 (predictions)
## i Fold2: preprocessor 1/1, model 8/9
## ✓ Fold2: preprocessor 1/1, model 8/9
## i Fold2: preprocessor 1/1, model 8/9 (extracts)
## i Fold2: preprocessor 1/1, model 8/9 (predictions)
## i Fold2: preprocessor 1/1, model 9/9
## ✓ Fold2: preprocessor 1/1, model 9/9
## i Fold2: preprocessor 1/1, model 9/9 (extracts)
## i Fold2: preprocessor 1/1, model 9/9 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/9
## ✓ Fold3: preprocessor 1/1, model 1/9
## i Fold3: preprocessor 1/1, model 1/9 (extracts)
## i Fold3: preprocessor 1/1, model 1/9 (predictions)
## i Fold3: preprocessor 1/1, model 2/9
## ✓ Fold3: preprocessor 1/1, model 2/9
## i Fold3: preprocessor 1/1, model 2/9 (extracts)
## i Fold3: preprocessor 1/1, model 2/9 (predictions)
## i Fold3: preprocessor 1/1, model 3/9
## ✓ Fold3: preprocessor 1/1, model 3/9
## i Fold3: preprocessor 1/1, model 3/9 (extracts)
## i Fold3: preprocessor 1/1, model 3/9 (predictions)
## i Fold3: preprocessor 1/1, model 4/9
## ✓ Fold3: preprocessor 1/1, model 4/9
## i Fold3: preprocessor 1/1, model 4/9 (extracts)
## i Fold3: preprocessor 1/1, model 4/9 (predictions)
## i Fold3: preprocessor 1/1, model 5/9
## ✓ Fold3: preprocessor 1/1, model 5/9
## i Fold3: preprocessor 1/1, model 5/9 (extracts)
## i Fold3: preprocessor 1/1, model 5/9 (predictions)
## i Fold3: preprocessor 1/1, model 6/9
## ✓ Fold3: preprocessor 1/1, model 6/9
## i Fold3: preprocessor 1/1, model 6/9 (extracts)
## i Fold3: preprocessor 1/1, model 6/9 (predictions)
## i Fold3: preprocessor 1/1, model 7/9
## ✓ Fold3: preprocessor 1/1, model 7/9
## i Fold3: preprocessor 1/1, model 7/9 (extracts)
## i Fold3: preprocessor 1/1, model 7/9 (predictions)
## i Fold3: preprocessor 1/1, model 8/9
## ✓ Fold3: preprocessor 1/1, model 8/9
## i Fold3: preprocessor 1/1, model 8/9 (extracts)
## i Fold3: preprocessor 1/1, model 8/9 (predictions)
## i Fold3: preprocessor 1/1, model 9/9
## ✓ Fold3: preprocessor 1/1, model 9/9
## i Fold3: preprocessor 1/1, model 9/9 (extracts)
## i Fold3: preprocessor 1/1, model 9/9 (predictions)
best_rf_300 <- select_best(rf_tuned_300, metric = "rmse")
rf_final_300 <- finalize_workflow(workflow(rec_300, rf_spec), best_rf_300) %>%
fit(train_300)
pred_300_eur <- predict(rf_final_300, test_300)$.pred
rmse_300_eur <- yardstick::rmse_vec(test_300$price, pred_300_eur)
mae_300_eur <- yardstick::mae_vec (test_300$price, pred_300_eur)
cat("RMSE ≤300:", round(rmse_300_eur,2),
"| MAE:", round(mae_300_eur,2), "€\n")
## RMSE ≤300: 32.51 | MAE: 21.74 €
Ajustamos una versión logarítmica para el subconjunto ≤ 300 €.
rec_300_log <- recipe(price ~ ., data = train_300) %>%
step_zv(all_predictors()) %>%
step_log(price, base = 10, skip = TRUE) %>%
step_log(host_listings_count, availability_30, availability_60,
availability_90, availability_365, offset = 1) %>%
step_normalize(all_numeric_predictors())
set.seed(42)
rf_tuned_300_log <- tune_grid(
workflow(rec_300_log, rf_spec),
resamples = vfold_cv(train_300, v = 3),
grid = rf_grid,
metrics = metric_set(yardstick::rmse),
control = control_grid(verbose = TRUE, allow_par = FALSE)
)
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/9
## ✓ Fold1: preprocessor 1/1, model 1/9
## i Fold1: preprocessor 1/1, model 1/9 (extracts)
## i Fold1: preprocessor 1/1, model 1/9 (predictions)
## i Fold1: preprocessor 1/1, model 2/9
## ✓ Fold1: preprocessor 1/1, model 2/9
## i Fold1: preprocessor 1/1, model 2/9 (extracts)
## i Fold1: preprocessor 1/1, model 2/9 (predictions)
## i Fold1: preprocessor 1/1, model 3/9
## ✓ Fold1: preprocessor 1/1, model 3/9
## i Fold1: preprocessor 1/1, model 3/9 (extracts)
## i Fold1: preprocessor 1/1, model 3/9 (predictions)
## i Fold1: preprocessor 1/1, model 4/9
## ✓ Fold1: preprocessor 1/1, model 4/9
## i Fold1: preprocessor 1/1, model 4/9 (extracts)
## i Fold1: preprocessor 1/1, model 4/9 (predictions)
## i Fold1: preprocessor 1/1, model 5/9
## ✓ Fold1: preprocessor 1/1, model 5/9
## i Fold1: preprocessor 1/1, model 5/9 (extracts)
## i Fold1: preprocessor 1/1, model 5/9 (predictions)
## i Fold1: preprocessor 1/1, model 6/9
## ✓ Fold1: preprocessor 1/1, model 6/9
## i Fold1: preprocessor 1/1, model 6/9 (extracts)
## i Fold1: preprocessor 1/1, model 6/9 (predictions)
## i Fold1: preprocessor 1/1, model 7/9
## ✓ Fold1: preprocessor 1/1, model 7/9
## i Fold1: preprocessor 1/1, model 7/9 (extracts)
## i Fold1: preprocessor 1/1, model 7/9 (predictions)
## i Fold1: preprocessor 1/1, model 8/9
## ✓ Fold1: preprocessor 1/1, model 8/9
## i Fold1: preprocessor 1/1, model 8/9 (extracts)
## i Fold1: preprocessor 1/1, model 8/9 (predictions)
## i Fold1: preprocessor 1/1, model 9/9
## ✓ Fold1: preprocessor 1/1, model 9/9
## i Fold1: preprocessor 1/1, model 9/9 (extracts)
## i Fold1: preprocessor 1/1, model 9/9 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/9
## ✓ Fold2: preprocessor 1/1, model 1/9
## i Fold2: preprocessor 1/1, model 1/9 (extracts)
## i Fold2: preprocessor 1/1, model 1/9 (predictions)
## i Fold2: preprocessor 1/1, model 2/9
## ✓ Fold2: preprocessor 1/1, model 2/9
## i Fold2: preprocessor 1/1, model 2/9 (extracts)
## i Fold2: preprocessor 1/1, model 2/9 (predictions)
## i Fold2: preprocessor 1/1, model 3/9
## ✓ Fold2: preprocessor 1/1, model 3/9
## i Fold2: preprocessor 1/1, model 3/9 (extracts)
## i Fold2: preprocessor 1/1, model 3/9 (predictions)
## i Fold2: preprocessor 1/1, model 4/9
## ✓ Fold2: preprocessor 1/1, model 4/9
## i Fold2: preprocessor 1/1, model 4/9 (extracts)
## i Fold2: preprocessor 1/1, model 4/9 (predictions)
## i Fold2: preprocessor 1/1, model 5/9
## ✓ Fold2: preprocessor 1/1, model 5/9
## i Fold2: preprocessor 1/1, model 5/9 (extracts)
## i Fold2: preprocessor 1/1, model 5/9 (predictions)
## i Fold2: preprocessor 1/1, model 6/9
## ✓ Fold2: preprocessor 1/1, model 6/9
## i Fold2: preprocessor 1/1, model 6/9 (extracts)
## i Fold2: preprocessor 1/1, model 6/9 (predictions)
## i Fold2: preprocessor 1/1, model 7/9
## ✓ Fold2: preprocessor 1/1, model 7/9
## i Fold2: preprocessor 1/1, model 7/9 (extracts)
## i Fold2: preprocessor 1/1, model 7/9 (predictions)
## i Fold2: preprocessor 1/1, model 8/9
## ✓ Fold2: preprocessor 1/1, model 8/9
## i Fold2: preprocessor 1/1, model 8/9 (extracts)
## i Fold2: preprocessor 1/1, model 8/9 (predictions)
## i Fold2: preprocessor 1/1, model 9/9
## ✓ Fold2: preprocessor 1/1, model 9/9
## i Fold2: preprocessor 1/1, model 9/9 (extracts)
## i Fold2: preprocessor 1/1, model 9/9 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/9
## ✓ Fold3: preprocessor 1/1, model 1/9
## i Fold3: preprocessor 1/1, model 1/9 (extracts)
## i Fold3: preprocessor 1/1, model 1/9 (predictions)
## i Fold3: preprocessor 1/1, model 2/9
## ✓ Fold3: preprocessor 1/1, model 2/9
## i Fold3: preprocessor 1/1, model 2/9 (extracts)
## i Fold3: preprocessor 1/1, model 2/9 (predictions)
## i Fold3: preprocessor 1/1, model 3/9
## ✓ Fold3: preprocessor 1/1, model 3/9
## i Fold3: preprocessor 1/1, model 3/9 (extracts)
## i Fold3: preprocessor 1/1, model 3/9 (predictions)
## i Fold3: preprocessor 1/1, model 4/9
## ✓ Fold3: preprocessor 1/1, model 4/9
## i Fold3: preprocessor 1/1, model 4/9 (extracts)
## i Fold3: preprocessor 1/1, model 4/9 (predictions)
## i Fold3: preprocessor 1/1, model 5/9
## ✓ Fold3: preprocessor 1/1, model 5/9
## i Fold3: preprocessor 1/1, model 5/9 (extracts)
## i Fold3: preprocessor 1/1, model 5/9 (predictions)
## i Fold3: preprocessor 1/1, model 6/9
## ✓ Fold3: preprocessor 1/1, model 6/9
## i Fold3: preprocessor 1/1, model 6/9 (extracts)
## i Fold3: preprocessor 1/1, model 6/9 (predictions)
## i Fold3: preprocessor 1/1, model 7/9
## ✓ Fold3: preprocessor 1/1, model 7/9
## i Fold3: preprocessor 1/1, model 7/9 (extracts)
## i Fold3: preprocessor 1/1, model 7/9 (predictions)
## i Fold3: preprocessor 1/1, model 8/9
## ✓ Fold3: preprocessor 1/1, model 8/9
## i Fold3: preprocessor 1/1, model 8/9 (extracts)
## i Fold3: preprocessor 1/1, model 8/9 (predictions)
## i Fold3: preprocessor 1/1, model 9/9
## ✓ Fold3: preprocessor 1/1, model 9/9
## i Fold3: preprocessor 1/1, model 9/9 (extracts)
## i Fold3: preprocessor 1/1, model 9/9 (predictions)
best_rf_300_log <- select_best(rf_tuned_300_log, metric = "rmse")
rf_final_300_log <- finalize_workflow(workflow(rec_300_log, rf_spec),
best_rf_300_log) %>%
fit(train_300)
pred_300_log <- predict(rf_final_300_log, test_300)$.pred
pred_300_eur <- 10 ^ pred_300_log
rmse_300_log_eur <- yardstick::rmse_vec(test_300$price, pred_300_eur)
mae_300_log_eur <- yardstick::mae_vec (test_300$price, pred_300_eur)
cat("RMSE log→€ ≤300:", round(rmse_300_log_eur,2),
"| MAE:", round(mae_300_log_eur,2), "€\n")
## RMSE log→€ ≤300: 33.2 | MAE: 21.01 €
Creamos una tabla resumen con los resultados de todas las variantes del modelo.
results_rf <- tibble(
Escenario = c("€", "log→€", "€ ≤300", "log→€ ≤300"),
RMSE = c(rmse_rf_eur,
rmse_log_eur,
rmse_300_eur,
rmse_300_log_eur),
MAE = c(mae_rf_eur,
mae_log_eur,
mae_300_eur,
mae_300_log_eur),
PrecioMedio = c(mean(test_rf$price),
mean(test_rf$price),
mean(test_300$price),
mean(test_300$price))
) %>%
mutate(
MAPE = 100 * c(
yardstick::mape_vec(test_rf$price, pred_eur),
yardstick::mape_vec(test_rf$price, pred_eur_log),
yardstick::mape_vec(test_300$price, pred_300_eur),
yardstick::mape_vec(test_300$price, pred_300_eur)
),
NRMSE = 100 * RMSE / PrecioMedio,
NMAE = 100 * MAE / PrecioMedio
) %>%
select(Escenario, RMSE, MAE, MAPE, NRMSE, NMAE)
print(results_rf)
## # A tibble: 4 × 6
## Escenario RMSE MAE MAPE NRMSE NMAE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 € 43.4 25.2 3779. 49.9 28.9
## 2 log→€ 44.2 23.9 3060. 50.8 27.4
## 3 € ≤300 32.5 21.7 3009. 41.3 27.6
## 4 log→€ ≤300 33.2 21.0 3009. 42.2 26.7
Visualizamos el RMSE absoluto y el RMSE normalizado (NRMSE) por escenario.
ggplot(results_rf, aes(Escenario, RMSE, fill = Escenario)) +
geom_col(width = 0.65, show.legend = FALSE) +
geom_text(aes(label = round(RMSE, 1)), vjust = -0.25, size = 4) +
labs(title = "Random Forest · RMSE absoluto", y = "RMSE (€)", x = "") +
theme_minimal()
ggplot(results_rf, aes(Escenario, MAE, fill = Escenario)) +
geom_col(width = 0.65, show.legend = FALSE) +
geom_text(aes(label = round(MAE, 2)), vjust = -0.3, size = 3.8) +
labs(title = "Random Forest · MAE absoluto (€)", y = "MAE (€)", x = "") +
theme_minimal()
ggplot(results_rf, aes(Escenario, NRMSE, fill = Escenario)) +
geom_col(width = 0.65, show.legend = FALSE) +
geom_text(aes(label = paste0(round(NRMSE, 1), "%")),
vjust = -0.25, size = 4) +
labs(title = "Random Forest · RMSE normalizado al precio medio",
y = "NRMSE (%)", x = "") +
theme_minimal()
ggplot(results_rf, aes(Escenario, NMAE, fill = Escenario)) +
geom_col(width = 0.65, show.legend = FALSE) +
geom_text(aes(label = paste0(round(NMAE, 1), "%")),
vjust = -0.3, size = 3.8) +
labs(title = "Random Forest · MAE normalizado (%)",
y = "MAE / Precio medio (%)", x = "") +
theme_minimal()
Determinamos cuál es el mejor modelo de cada tipo
# Índices del mejor modelo según RMSE
mejor_knn_idx <- which.min(results_knn$RMSE)
mejor_rf_idx <- which.min(results_rf$RMSE)
# Listas de predicciones y valores reales para cada variante de k-NN
knn_preds <- list(pred_eur, pred_eur_from_log, pred_300, pred_eur_300)
knn_reales <- list(y_test, y_test, y_te2, y_te2)
# Selección del mejor modelo k-NN
mejor_knn_pred <- knn_preds[[mejor_knn_idx]]
mejor_knn_real <- knn_reales[[mejor_knn_idx]]
# Listas de predicciones y valores reales para cada variante de Random Forest
rf_preds <- list(pred_eur, pred_eur_log, pred_300_eur, pred_300_eur)
rf_reales <- list(test_rf$price, test_rf$price, test_300$price, test_300$price)
# Selección del mejor modelo Random Forest
mejor_rf_pred <- rf_preds[[mejor_rf_idx]]
mejor_rf_real <- rf_reales[[mejor_rf_idx]]
length(mejor_knn_pred)
## [1] 3190
length(mejor_rf_pred)
## [1] 3193
Gráfico: k-NN · Real vs Predicho (modelo con menor RMSE)
results_knn_plot <- data.frame(real = mejor_knn_real, pred = mejor_knn_pred)
ggplot(results_knn_plot, aes(x = real, y = pred)) +
geom_point(alpha = 0.5, color = "blue") +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
labs(title = "k-NN: Precio real vs predicho (mejor modelo)",
x = "Precio real (€)", y = "Precio predicho (€)") +
theme_minimal()
Gráfico: Random Forest · Real vs Predicho (modelo con menor RMSE)
results_rf_plot <- data.frame(real = mejor_rf_real, pred = mejor_rf_pred)
ggplot(results_rf_plot, aes(x = real, y = pred)) +
geom_point(alpha = 0.5, color = "darkgreen") +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
labs(title = "Random Forest: Precio real vs predicho (mejor modelo)",
x = "Precio real (€)", y = "Precio predicho (€)") +
theme_minimal()
Importancia de variables en el mejor modelo Random Forest
rf_model_fit <- extract_fit_parsnip(
list(rf_final, rf_final_log, rf_final_300, rf_final_300_log)[[mejor_rf_idx]]
)$fit
importancia <- rf_model_fit$variable.importance %>%
as.data.frame() %>%
rownames_to_column("Variable") %>%
rename(Importancia = ".")
importancia <- importancia %>%
arrange(desc(Importancia))
ggplot(importancia, aes(x = reorder(Variable, Importancia), y = Importancia)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(title = "Importancia de variables (mejor modelo Random Forest)",
x = "Variable", y = "Importancia") +
theme_minimal()
# Creamos tabla con errores
errores_rf <- data.frame(
real = mejor_rf_real,
pred = mejor_rf_pred
) %>%
mutate(
error_abs = abs(real - pred),
error_pct = abs(real - pred) / real * 100
)
# Seleccionamos el conjunto base
rf_bases <- list(test_rf, test_rf, test_300, test_300)
test_rf_mejor <- rf_bases[[mejor_rf_idx]]
# Unimos con las predicciones y errores
errores_rf <- test_rf_mejor %>%
mutate(
predicho = mejor_rf_pred,
error_abs = abs(price - predicho),
error_pct = abs(price - predicho) / price * 100
)
errores_top <- errores_rf %>%
arrange(desc(error_abs)) %>%
slice_head(n = 10)
print(errores_top)
## accommodates bedrooms bathrooms beds n_amenities room_type_normalizado
## 1 1 1 1.5 1 18 0
## 2 2 1 1.0 1 14 0
## 3 4 2 1.5 3 39 1
## 4 2 1 1.0 1 11 0
## 5 2 1 2.0 1 19 0
## 6 4 3 2.0 3 60 0
## 7 6 3 1.0 3 29 1
## 8 4 2 1.0 3 9 1
## 9 5 3 2.0 3 15 1
## 10 6 3 2.0 5 16 1
## ascensor_normalizado garaje_propio_normalizado adicionales_normalizado
## 1 1 0 0
## 2 1 0 0
## 3 0 0 0
## 4 0 0 0
## 5 1 0 0
## 6 1 0 0
## 7 0 0 0
## 8 1 0 0
## 9 1 0 1
## 10 0 1 0
## Las_Ramblas total_transporte_publico total_centros_salud_distrito
## 1 3.4005934 468 180
## 2 4.5924503 268 160
## 3 0.8447678 194 136
## 4 2.3547739 468 180
## 5 1.5428391 227 193
## 6 2.2243656 227 193
## 7 1.3702908 194 136
## 8 1.0244819 525 535
## 9 3.4696095 426 252
## 10 3.3025065 505 371
## total_parques_jardines_distrito criminalidad_distrito availability_30
## 1 62 21919 0
## 2 17 12148 0
## 3 21 44455 30
## 4 62 21919 22
## 5 23 8588 30
## 6 23 8588 0
## 7 21 44455 3
## 8 116 46754 2
## 9 54 25408 0
## 10 49 9955 28
## availability_60 availability_90 availability_365 minimum_nights
## 1 0 0 0 4
## 2 0 0 0 2
## 3 60 90 365 32
## 4 52 82 357 3
## 5 60 90 365 4
## 6 0 0 0 5
## 7 11 18 293 3
## 8 12 22 122 3
## 9 8 31 221 31
## 10 58 88 363 32
## maximum_nights host_listings_count price predicho error_abs error_pct
## 1 6 1 300 37.10434 262.8957 87.63189
## 2 5 1 275 38.06384 236.9362 86.15860
## 3 1125 1 300 71.70087 228.2991 76.09971
## 4 1125 6 250 60.51786 189.4821 75.79285
## 5 30 5 250 63.76812 186.2319 74.49275
## 6 1125 1 250 63.77199 186.2280 74.49120
## 7 1125 1 300 114.41413 185.5859 61.86196
## 8 180 6 300 121.90947 178.0905 59.36351
## 9 60 2 260 85.44722 174.5528 67.13568
## 10 1125 224 300 128.55231 171.4477 57.14923
ggplot(errores_rf, aes(x = price, y = predicho)) +
geom_point(alpha = 0.3) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
geom_point(data = errores_top, aes(x = price, y = predicho), color = "red", size = 2) +
labs(title = "Errores más altos en Random Forest",
x = "Precio real (€)", y = "Precio predicho (€)") +
theme_minimal()
write.csv(errores_rf, "errores_rf.csv", row.names = FALSE)
data_clean <- data_clean %>%
filter(
(room_type == "Entire home/apt" & price <= 600) |
(room_type == "Private room" & price <= 80)
)
table(data_clean$room_type)
##
## Entire home/apt Private room
## 7933 7663
summary(data_clean$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.00 39.00 60.00 85.92 100.00 599.00
knn_df <- data_clean %>%
select(all_of(vars_pred), price) %>%
drop_na()
preproc <- preProcess(knn_df[, vars_pred], method = c("center", "scale"))
X_scaled <- predict(preproc, knn_df[, vars_pred])
y <- knn_df$price
set.seed(42)
train_id <- createDataPartition(y, p = 0.8, list = FALSE)
X_train <- X_scaled[train_id, ]
X_test <- X_scaled[-train_id, ]
y_train <- y[train_id]
y_test <- y[-train_id]
Realizamos una validación cruzada para encontrar el mejor valor de k.
k_vals <- 3:35
rmse_cv <- numeric(length(k_vals))
folds <- createFolds(y_train, k = 5, returnTrain = TRUE)
for (i in seq_along(k_vals)) {
k <- k_vals[i]
rmse_fold <- numeric(length(folds))
for (j in seq_along(folds)) {
id_tr <- folds[[j]]
id_te <- setdiff(seq_along(y_train), id_tr)
pred_cv <- knn.reg(
train = X_train[id_tr, ], test = X_train[id_te, ],
y = y_train[id_tr], k = k)$pred
rmse_fold[j] <- rmse_vec(y_train[id_te], pred_cv)
}
rmse_cv[i] <- mean(rmse_fold)
}
best_k <- k_vals[which.min(rmse_cv)]
print(glue("Mejor k según CV: {best_k} (RMSE {round(min(rmse_cv),2)} €)"))
## Mejor k según CV: 11 (RMSE 46.91 €)
ggplot(data.frame(k = k_vals, RMSE = rmse_cv),
aes(k, RMSE)) +
geom_line() + geom_point() +
geom_vline(xintercept = best_k, linetype = "dashed") +
labs(title = "Búsqueda del k óptimo (CV 5-fold)",
x = "Número de vecinos (k)", y = "RMSE (€)")
Evaluamos el modelo en el conjunto de test, en euros.
pred_eur <- knn.reg(
train = X_train, test = X_test, y = y_train, k = best_k)$pred
rmse_eur <- rmse_vec(y_test, pred_eur)
mae_eur <- mae_vec (y_test, pred_eur)
cat("RMSE:", round(rmse_eur,2),
"| MAE:", round(mae_eur,2), "€\n")
## RMSE: 48.54 | MAE: 27.36 €
Evaluamos el modelo con la transformación logarítmica y reconversión a euros.
y_train_log <- log10(y_train)
y_test_log <- log10(y_test)
pred_log <- knn.reg(
train = X_train, test = X_test,
y = y_train_log, k = best_k)$pred
rmse_log <- rmse_vec(y_test_log, pred_log)
mae_log <- mae_vec (y_test_log, pred_log)
pred_eur_from_log <- 10 ^ pred_log
rmse_eur_from_log <- rmse_vec(y_test, pred_eur_from_log)
mae_eur_from_log <- mae_vec (y_test, pred_eur_from_log)
mult <- 10 ^ rmse_log
cat(glue::glue(
" ↳ RMSE(€): {round(rmse_eur_from_log,2)} €\n",
" ↳ MAE(€): {round(mae_eur_from_log ,2)} €\n"
))
## ↳ RMSE(€): 49.7 €
## ↳ MAE(€): 26.74 €
Filtramos el conjunto para propiedades con precio menor o igual a 300 € y evaluamos.
sel <- knn_df$price <= 300
X_filt <- X_scaled[sel, ]
y_filt <- y[sel]
set.seed(42)
id_tr2 <- createDataPartition(y_filt, p = 0.8, list = FALSE)
X_tr2 <- X_filt[id_tr2, ]; X_te2 <- X_filt[-id_tr2, ]
y_tr2 <- y_filt[id_tr2]; y_te2 <- y_filt[-id_tr2]
pred_300 <- knn.reg(train = X_tr2, test = X_te2,
y = y_tr2, k = best_k)$pred
rmse_300 <- rmse_vec(y_te2, pred_300)
mae_300 <- mae_vec (y_te2, pred_300)
cat("RMSE:", round(rmse_300,2),
"| MAE:", round(mae_300,2), "€\n")
## RMSE: 35.17 | MAE: 23.18 €
Aplicamos log10 para el conjunto filtrado y reconvertimos a euros.
y_tr2_log <- log10(y_tr2)
y_te2_log <- log10(y_te2)
pred_log_300 <- knn.reg(
train = X_tr2, test = X_te2,
y = y_tr2_log, k = best_k)$pred
rmse_log_300 <- rmse_vec(y_te2_log, pred_log_300)
mae_log_300 <- mae_vec (y_te2_log, pred_log_300)
pred_eur_300 <- 10 ^ pred_log_300
rmse_eur_300 <- rmse_vec(y_te2, pred_eur_300)
mae_eur_300 <- mae_vec (y_te2, pred_eur_300)
mult_300 <- 10 ^ rmse_log_300
cat(glue::glue(
"RMSE(log): {round(rmse_log_300,3)} (×{round(mult_300,2)})\n",
"RMSE(€): {round(rmse_eur_300,2)} € | MAE(€): {round(mae_eur_300,2)} €\n"
))
## RMSE(log): 0.175 (×1.5)
## RMSE(€): 35.47 € | MAE(€): 22.71 €
Construimos la tabla resumen de resultados en diferentes escenarios.
results_knn <- tibble(
Escenario = c("€", "log→€", "€ ≤300", "log→€ ≤300"),
RMSE = c(rmse_eur, rmse_eur_from_log, rmse_300, rmse_eur_300),
MAE = c(mae_eur, mae_eur_from_log, mae_300, mae_eur_300),
Media_Precio = c(mean(y_test), mean(y_test), mean(y_te2), mean(y_te2))
) %>%
mutate(
MAPE = 100 * c(
mape_vec(y_test, pred_eur),
mape_vec(y_test, pred_eur_from_log),
mape_vec(y_te2, pred_300),
mape_vec(y_te2, pred_eur_300)
),
NRMSE = 100 * RMSE / Media_Precio,
NMAE = 100 * MAE / Media_Precio
) %>%
select(Escenario, RMSE, MAE, MAPE, NRMSE, NMAE)
print(results_knn)
## # A tibble: 4 × 6
## Escenario RMSE MAE MAPE NRMSE NMAE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 € 48.5 27.4 3769. 56.0 31.6
## 2 log→€ 49.7 26.7 3375. 57.4 30.9
## 3 € ≤300 35.2 23.2 3768. 45.7 30.1
## 4 log→€ ≤300 35.5 22.7 3451. 46.1 29.5
Visualizamos los resultados de RMSE absoluto y normalizado por escenario.
ggplot(results_knn, aes(Escenario, RMSE, fill = Escenario)) +
geom_col(width = 0.65, show.legend = FALSE) +
geom_text(aes(label = round(RMSE,2)), vjust = -0.3, size = 3.8) +
labs(title = "k-NN · RMSE absoluto (€)", y = "RMSE (€)", x = "") +
theme_minimal()
ggplot(results_knn, aes(Escenario, MAE, fill = Escenario)) +
geom_col(width = 0.65, show.legend = FALSE) +
geom_text(aes(label = round(MAE,2)), vjust = -0.3, size = 3.8) +
labs(title = "k-NN · MAE absoluto (€)", y = "MAE (€)", x = "") +
theme_minimal()
ggplot(results_knn, aes(Escenario, NRMSE, fill = Escenario)) +
geom_col(width = 0.65, show.legend = FALSE) +
geom_text(aes(label = paste0(round(NRMSE,1), "%")),
vjust = -0.3, size = 3.8) +
labs(title = "k-NN · RMSE normalizado (%)",
y = "RMSE / Precio medio (%)", x = "") +
theme_minimal()
ggplot(results_knn, aes(Escenario, NMAE, fill = Escenario)) +
geom_col(width = 0.65, show.legend = FALSE) +
geom_text(aes(label = paste0(round(NMAE,1), "%")),
vjust = -0.3, size = 3.8) +
labs(title = "k-NN · MAE normalizado (%)",
y = "MAE / Precio medio (%)", x = "") +
theme_minimal()
data_aptos <- data_clean %>%
filter(room_type == "Entire home/apt")
data_aptos$room_type_normalizado <- 1
data_aptos$host_is_superhost <- as.integer(data_aptos$host_is_superhost == "t")
knn_df_aptos <- data_aptos %>%
select(all_of(vars_pred), price) %>%
drop_na()
preproc_aptos <- preProcess(knn_df_aptos[, vars_pred], method = c("center", "scale"))
## Warning in preProcess.default(knn_df_aptos[, vars_pred], method = c("center", :
## These variables have zero variances: room_type_normalizado
X_scaled_aptos <- predict(preproc_aptos, knn_df_aptos[, vars_pred])
y_aptos <- knn_df_aptos$price
set.seed(42)
train_id_aptos <- createDataPartition(y_aptos, p = 0.8, list = FALSE)
X_train_aptos <- X_scaled_aptos[train_id_aptos, ]
X_test_aptos <- X_scaled_aptos[-train_id_aptos, ]
y_train_aptos <- y_aptos[train_id_aptos]
y_test_aptos <- y_aptos[-train_id_aptos]
k_vals <- 3:35
rmse_cv <- numeric(length(k_vals))
folds <- createFolds(y_train_aptos, k = 5, returnTrain = TRUE)
for (i in seq_along(k_vals)) {
k <- k_vals[i]
rmse_fold <- numeric(length(folds))
for (j in seq_along(folds)) {
id_tr <- folds[[j]]
id_te <- setdiff(seq_along(y_train_aptos), id_tr)
pred_cv <- knn.reg(
train = X_train_aptos[id_tr, ], test = X_train_aptos[id_te, ],
y = y_train_aptos[id_tr], k = k)$pred
rmse_fold[j] <- rmse_vec(y_train_aptos[id_te], pred_cv)
}
rmse_cv[i] <- mean(rmse_fold)
}
best_k_aptos <- k_vals[which.min(rmse_cv)]
ggplot(data.frame(k = k_vals, RMSE = rmse_cv),
aes(k, RMSE)) +
geom_line() + geom_point() +
geom_vline(xintercept = best_k_aptos, linetype = "dashed") +
labs(title = "Apartamentos · Búsqueda del k óptimo",
x = "Número de vecinos (k)", y = "RMSE (€)")
pred_aptos <- knn.reg(train = X_train_aptos, test = X_test_aptos, y = y_train_aptos, k = best_k_aptos)$pred
rmse_eur_aptos <- rmse_vec(y_test_aptos, pred_aptos)
mae_eur_aptos <- mae_vec (y_test_aptos, pred_aptos)
pred_log_aptos <- knn.reg(
train = X_train_aptos, test = X_test_aptos,
y = log10(y_train_aptos), k = best_k_aptos)$pred
pred_eur_from_log_aptos <- 10 ^ pred_log_aptos
rmse_eur_from_log_aptos <- rmse_vec(y_test_aptos, pred_eur_from_log_aptos)
mae_eur_from_log_aptos <- mae_vec (y_test_aptos, pred_eur_from_log_aptos)
results_knn_aptos <- tibble(
Escenario = c("€", "log→€"),
RMSE = c(rmse_eur_aptos, rmse_eur_from_log_aptos),
MAE = c(mae_eur_aptos, mae_eur_from_log_aptos),
Media_Precio = mean(y_test_aptos)
) %>%
mutate(
MAPE = 100 * c(
mape_vec(y_test_aptos, pred_aptos),
mape_vec(y_test_aptos, pred_eur_from_log_aptos)
),
NRMSE = 100 * RMSE / Media_Precio,
NMAE = 100 * MAE / Media_Precio,
Tipo = "Apartamento"
)
data_priv <- data_clean %>%
filter(room_type == "Private room")
data_priv$room_type_normalizado <- 0
data_priv$host_is_superhost <- as.integer(data_priv$host_is_superhost == "t")
knn_df_priv <- data_priv %>%
select(all_of(vars_pred), price) %>%
drop_na()
preproc_priv <- preProcess(knn_df_priv[, vars_pred], method = c("center", "scale"))
## Warning in preProcess.default(knn_df_priv[, vars_pred], method = c("center", :
## These variables have zero variances: room_type_normalizado
X_scaled_priv <- predict(preproc_priv, knn_df_priv[, vars_pred])
y_priv <- knn_df_priv$price
set.seed(42)
train_id_priv <- createDataPartition(y_priv, p = 0.8, list = FALSE)
X_train_priv <- X_scaled_priv[train_id_priv, ]
X_test_priv <- X_scaled_priv[-train_id_priv, ]
y_train_priv <- y_priv[train_id_priv]
y_test_priv <- y_priv[-train_id_priv]
k_vals <- 3:35
rmse_cv <- numeric(length(k_vals))
folds <- createFolds(y_train_priv, k = 5, returnTrain = TRUE)
for (i in seq_along(k_vals)) {
k <- k_vals[i]
rmse_fold <- numeric(length(folds))
for (j in seq_along(folds)) {
id_tr <- folds[[j]]
id_te <- setdiff(seq_along(y_train_priv), id_tr)
pred_cv <- knn.reg(
train = X_train_priv[id_tr, ], test = X_train_priv[id_te, ],
y = y_train_priv[id_tr], k = k)$pred
rmse_fold[j] <- rmse_vec(y_train_priv[id_te], pred_cv)
}
rmse_cv[i] <- mean(rmse_fold)
}
best_k_priv <- k_vals[which.min(rmse_cv)]
ggplot(data.frame(k = k_vals, RMSE = rmse_cv),
aes(k, RMSE)) +
geom_line() + geom_point() +
geom_vline(xintercept = best_k_priv, linetype = "dashed") +
labs(title = "Habitaciones privadas · Búsqueda del k óptimo",
x = "Número de vecinos (k)", y = "RMSE (€)")
pred_priv <- knn.reg(train = X_train_priv, test = X_test_priv, y = y_train_priv, k = best_k_priv)$pred
rmse_eur_priv <- rmse_vec(y_test_priv, pred_priv)
mae_eur_priv <- mae_vec (y_test_priv, pred_priv)
pred_log_priv <- knn.reg(
train = X_train_priv, test = X_test_priv,
y = log10(y_train_priv), k = best_k_priv)$pred
pred_eur_from_log_priv <- 10 ^ pred_log_priv
rmse_eur_from_log_priv <- rmse_vec(y_test_priv, pred_eur_from_log_priv)
mae_eur_from_log_priv <- mae_vec (y_test_priv, pred_eur_from_log_priv)
results_knn_priv <- tibble(
Escenario = c("€", "log→€"),
RMSE = c(rmse_eur_priv, rmse_eur_from_log_priv),
MAE = c(mae_eur_priv, mae_eur_from_log_priv),
Media_Precio = mean(y_test_priv)
) %>%
mutate(
MAPE = 100 * c(
mape_vec(y_test_priv, pred_priv),
mape_vec(y_test_priv, pred_eur_from_log_priv)
),
NRMSE = 100 * RMSE / Media_Precio,
NMAE = 100 * MAE / Media_Precio,
Tipo = "Habitación privada"
)
results_knn_total <- bind_rows(results_knn_aptos, results_knn_priv)
ggplot(results_knn_total, aes(Escenario, RMSE, fill = Tipo)) +
geom_col(position = "dodge") +
geom_text(aes(label = round(RMSE,2)), vjust = -0.3, position = position_dodge(0.9), size = 3.5) +
labs(title = "k-NN · RMSE por tipo de alojamiento", y = "RMSE (€)", x = "") +
theme_minimal()
ggplot(results_knn_total, aes(Escenario, NRMSE, fill = Tipo)) +
geom_col(position = "dodge") +
geom_text(aes(label = paste0(round(NRMSE,1), "%")), vjust = -0.3, position = position_dodge(0.9), size = 3.5) +
labs(title = "k-NN · RMSE normalizado por tipo", y = "NRMSE (%)", x = "") +
theme_minimal()
ggplot(results_knn_total, aes(Escenario, NRMSE, fill = Tipo)) +
geom_col(position = "dodge") +
geom_text(aes(label = paste0(round(NRMSE, 1), "%")),
vjust = -0.3, position = position_dodge(0.9), size = 3.5) +
labs(title = "k-NN · RMSE normalizado (% del precio)",
y = "NRMSE (%)", x = "") +
theme_minimal()
ggplot(results_knn_total, aes(Escenario, NMAE, fill = Tipo)) +
geom_col(position = "dodge") +
geom_text(aes(label = paste0(round(NMAE, 1), "%")),
vjust = -0.3, position = position_dodge(0.9), size = 3.5) +
labs(title = "k-NN · MAE normalizado (% del precio)",
y = "NMAE (%)", x = "") +
theme_minimal()
A lo largo del análisis se ha observado que el MAE se mantiene elevado, incluso al aplicar distintos modelos (k-NN, Random Forest) y escalas (lineal y logarítmica), e incluso tras una cuidadosa limpieza del conjunto de datos (eliminación de outliers por tipo de habitación). Esto sugiere que existen limitaciones estructurales en los datos disponibles, que afectan a la capacidad predictiva de cualquier modelo.
El precio lo define libremente el anfitrión A diferencia de variables físicas como número de habitaciones o ubicación, el precio es subjetivo y puede responder a criterios no observables:
Expectativas personales del anfitrión
Estrategias de posicionamiento
Cambios de precio sin reflejarse en otros campos
Falta de información sobre el contenido visual del anuncio Las fotos juegan un papel crucial en la percepción de valor, pero este análisis no incluye métricas de calidad visual ni estética. ➤ Dos pisos similares en características pueden tener precios muy diferentes si uno tiene fotos profesionales y otro no.
Se trabaja con anuncios, no con reservas reales El precio listado puede no ser el que realmente se paga. Muchos anuncios usan precios “de atracción” que luego suben dinámicamente. ➤ Lo ideal sería contar con datos de reservas efectivas, con precio final pagado por noche.
Alojamientos únicos y difícilmente comparables Aunque se agrupen por tipo, siguen existiendo muchas combinaciones únicas:
Vistas especiales
Decoración temática
Servicios especiales no capturados en las variables disponibles
Poca información sobre demanda real Aunque se usan variables como availability_30, no se cuenta con métricas explícitas de tasa de ocupación, número de reservas o cancelaciones, lo que limitaría la capacidad del modelo de entender el valor percibido.
Recomendaciones futuras
Incluir variables adicionales como calidad de fotos
Acceder a bases de datos con historial de reservas reales para trabajar con precios efectivos.
Explorar modelos de series temporales o dinámicos si se cuenta con precios en distintos momentos del año.